Skip to main content

codelens_engine/symbols/
mod.rs

1mod parser;
2mod ranking;
3mod reader;
4pub mod scoring;
5#[cfg(test)]
6mod tests;
7mod types;
8mod writer;
9
10use parser::{
11    extend_start_to_doc_comments, flatten_symbol_infos, flatten_symbols, parse_symbols,
12    slice_source, to_symbol_info,
13};
14use ranking::prune_to_budget;
15use scoring::score_symbol;
16pub use scoring::{
17    sparse_coverage_bonus_from_fields, sparse_max_bonus, sparse_threshold, sparse_weighting_enabled,
18};
19pub(crate) use types::ReadDb;
20pub use types::{
21    make_symbol_id, parse_symbol_id, IndexStats, RankedContextEntry, RankedContextResult,
22    SymbolInfo, SymbolKind, SymbolProvenance,
23};
24
25use crate::db::{self, content_hash, index_db_path, IndexDb};
26// Re-export language_for_path so downstream crate modules keep working.
27pub(crate) use crate::lang_config::{language_for_path, LanguageConfig};
28use crate::project::ProjectRoot;
29use anyhow::{bail, Context, Result};
30use std::fs;
31use std::path::{Path, PathBuf};
32use std::time::UNIX_EPOCH;
33use walkdir::WalkDir;
34
35use crate::project::{collect_files, is_excluded};
36
37// Types (SymbolKind, SymbolInfo, ParsedSymbol, IndexStats, RankedContextEntry,
38// RankedContextResult, ReadDb) are in types.rs, re-exported above.
39
/// SQLite-backed symbol index for a project.
///
/// Architecture: writer `Mutex<IndexDb>` for mutations + per-query read-only
/// connections for `_cached` methods. This makes `SymbolIndex: Send + Sync`,
/// enabling `Arc<SymbolIndex>` without an external Mutex.
pub struct SymbolIndex {
    /// Project root; used to resolve user-supplied paths and to turn absolute
    /// paths back into project-relative ones.
    project: ProjectRoot,
    /// Location of the index database on disk (left empty by `new_memory`).
    db_path: PathBuf,
    /// The single mutable connection; always accessed through `writer()`.
    writer: std::sync::Mutex<IndexDb>,
    /// In-memory mode flag (tests) — when true, _cached reads use the writer.
    in_memory: bool,
}
52
53impl SymbolIndex {
54    pub fn new(project: ProjectRoot) -> Self {
55        let db_path = index_db_path(project.as_path());
56        let db = IndexDb::open(&db_path).unwrap_or_else(|e| {
57            tracing::warn!(
58                path = %db_path.display(),
59                error = %e,
60                "failed to open DB, falling back to in-memory"
61            );
62            IndexDb::open_memory().unwrap()
63        });
64        let in_memory = !db_path.is_file();
65        let mut idx = Self {
66            project,
67            db_path,
68            writer: std::sync::Mutex::new(db),
69            in_memory,
70        };
71        // Auto-migrate from legacy JSON index if DB is empty
72        if idx.writer().file_count().unwrap_or(0) == 0 {
73            let _ = idx.migrate_from_json();
74        }
75        idx
76    }
77
78    /// Acquire the writer connection (poison-safe).
79    fn writer(&self) -> std::sync::MutexGuard<'_, IndexDb> {
80        self.writer
81            .lock()
82            .unwrap_or_else(|poisoned| poisoned.into_inner())
83    }
84
85    /// Open a read-only DB connection for queries (or fall back to writer for in-memory).
86    fn reader(&self) -> Result<ReadDb<'_>> {
87        if self.in_memory {
88            return Ok(ReadDb::Writer(self.writer()));
89        }
90        match IndexDb::open_readonly(&self.db_path)? {
91            Some(db) => Ok(ReadDb::Owned(db)),
92            None => Ok(ReadDb::Writer(self.writer())),
93        }
94    }
95
96    /// Create an in-memory index (for tests and benchmarks — no disk persistence).
97    pub fn new_memory(project: ProjectRoot) -> Self {
98        let db = IndexDb::open_memory().unwrap();
99        Self {
100            db_path: PathBuf::new(),
101            project,
102            writer: std::sync::Mutex::new(db),
103            in_memory: true,
104        }
105    }
106
107    pub fn stats(&self) -> Result<IndexStats> {
108        let db = self.reader()?;
109        let supported_files = collect_candidate_files(self.project.as_path())?;
110        let indexed_files = db.file_count()?;
111        let indexed_paths = db.all_file_paths()?;
112
113        let mut stale = 0usize;
114        for rel in &indexed_paths {
115            let path = self.project.as_path().join(rel);
116            if !path.is_file() {
117                stale += 1;
118                continue;
119            }
120            let content = match fs::read(&path) {
121                Ok(c) => c,
122                Err(_) => {
123                    stale += 1;
124                    continue;
125                }
126            };
127            let hash = content_hash(&content);
128            let mtime = file_modified_ms(&path).unwrap_or(0) as i64;
129            if db.get_fresh_file(rel, mtime, &hash)?.is_none() {
130                stale += 1;
131            }
132        }
133
134        Ok(IndexStats {
135            indexed_files,
136            supported_files: supported_files.len(),
137            stale_files: stale,
138        })
139    }
140
141    /// SelectSolve file pre-filtering: score files by name relevance to query,
142    /// then extract symbols only from top-scoring files.
143    /// Path-first retrieval with FTS5 boost: file paths scored by query token
144    /// matching, then boosted by FTS5 symbol hits in the same file.
145    fn select_solve_symbols(&self, query: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
146        // Collect file paths and compute top matches inside a block so the
147        // MutexGuard (ReadDb::Writer) is dropped before we call find_symbol /
148        // get_symbols_overview_cached, which also need the lock.  Holding the
149        // guard across those calls causes a deadlock with in-memory DBs.
150        //
151        // FTS5 boost: search each query token as a symbol name via FTS5,
152        // collect which files contain matching symbols, and boost those files.
153        // Token-level search is critical for NL queries like "how does dispatch
154        // work" — the full query won't match any symbol, but "dispatch" will
155        // find dispatch_tool in dispatch/mod.rs.
156        let fts_file_boost: std::collections::HashSet<String> = {
157            let query_lower = query.to_ascii_lowercase();
158            let tokens: Vec<&str> = query_lower
159                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
160                .filter(|t| t.len() >= 3)
161                .collect();
162            let mut boost_files = std::collections::HashSet::new();
163            // First try full query (catches exact symbol names like "dispatch_tool")
164            if let Ok(hits) = self.find_symbol(query, None, false, false, 15) {
165                for sym in hits {
166                    boost_files.insert(sym.file_path);
167                }
168            }
169            // Then try individual tokens (catches NL queries)
170            for token in &tokens {
171                if let Ok(hits) = self.find_symbol(token, None, false, false, 10) {
172                    for sym in hits {
173                        boost_files.insert(sym.file_path);
174                    }
175                }
176            }
177            boost_files
178        };
179
180        let (top_files, importer_files): (Vec<String>, Vec<String>) = {
181            let db = self.reader()?;
182            let all_paths = db.all_file_paths()?;
183
184            let query_lower = query.to_ascii_lowercase();
185            let query_tokens: Vec<&str> = query_lower
186                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
187                .filter(|t| t.len() >= 3)
188                .collect();
189
190            let mut file_scores: Vec<(String, usize)> = all_paths
191                .into_iter()
192                .map(|path| {
193                    let path_lower = path.to_ascii_lowercase();
194                    let mut score = query_tokens
195                        .iter()
196                        .filter(|token| path_lower.contains(**token))
197                        .count();
198                    // FTS5 boost: files containing matching symbols get +2
199                    if fts_file_boost.contains(&path) {
200                        score += 2;
201                    }
202                    (path, score)
203                })
204                .collect();
205
206            file_scores.sort_by(|a, b| b.1.cmp(&a.1));
207            let top: Vec<String> = file_scores
208                .into_iter()
209                .filter(|(_, score)| *score > 0)
210                .take(10)
211                .map(|(path, _)| path)
212                .collect();
213
214            // Import graph proximity: files that import top-matched files
215            // provide structural context (callers, consumers of matched code).
216            let mut importers = Vec::new();
217            if !top.is_empty() && top.len() <= 5 {
218                for file_path in top.iter().take(3) {
219                    if let Ok(imp) = db.get_importers(file_path) {
220                        for importer_path in imp.into_iter().take(3) {
221                            importers.push(importer_path);
222                        }
223                    }
224                }
225            }
226
227            (top, importers)
228            // db (MutexGuard) dropped here
229        };
230
231        // If no file matches (path + FTS5 both empty), fall back to broad symbol search
232        if top_files.is_empty() {
233            return self.find_symbol(query, None, false, false, 500);
234        }
235
236        // Collect symbols from top files
237        let mut all_symbols = Vec::new();
238        for file_path in &top_files {
239            if let Ok(symbols) = self.get_symbols_overview_cached(file_path, depth) {
240                all_symbols.extend(symbols);
241            }
242        }
243
244        // Import graph proximity: include symbols from files that import top matches.
245        // These provide structural context (callers, consumers of matched code).
246        for importer_path in &importer_files {
247            if let Ok(symbols) = self.get_symbols_overview_cached(importer_path, 1) {
248                all_symbols.extend(symbols);
249            }
250        }
251
252        // Also include direct symbol name matches (for exact/substring hits)
253        let mut seen_ids: std::collections::HashSet<String> =
254            all_symbols.iter().map(|s| s.id.clone()).collect();
255
256        if let Ok(direct) = self.find_symbol(query, None, false, false, 50) {
257            for sym in direct {
258                if seen_ids.insert(sym.id.clone()) {
259                    all_symbols.push(sym);
260                }
261            }
262        }
263
264        // For multi-word queries, also search individual tokens as symbol names
265        // (e.g., "dispatch tool call" → search for "dispatch", "tool", "call")
266        let query_lower = query.to_ascii_lowercase();
267        let tokens: Vec<&str> = query_lower
268            .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
269            .filter(|t| t.len() >= 3)
270            .collect();
271        if tokens.len() >= 2 {
272            for token in &tokens {
273                match self.find_symbol(token, None, false, false, 10) {
274                    Ok(hits) => {
275                        for sym in hits {
276                            if seen_ids.insert(sym.id.clone()) {
277                                all_symbols.push(sym);
278                            }
279                        }
280                    }
281                    Err(e) => {
282                        tracing::debug!(token, error = %e, "token find_symbol failed");
283                    }
284                }
285            }
286        }
287
288        Ok(all_symbols)
289    }
290
291    /// Hierarchical project structure: per-directory file count + symbol count.
292    /// Used as Level 1 pruning — lets LLM decide which directories to drill into.
293    pub fn get_project_structure(&self) -> Result<Vec<db::DirStats>> {
294        let db = self.reader()?;
295        db.dir_stats()
296    }
297
298    pub fn indexed_file_paths(&self) -> Result<Vec<String>> {
299        let db = self.reader()?;
300        db.all_file_paths()
301    }
302
303    pub fn get_symbols_overview(&self, path: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
304        let resolved = self.project.resolve(path)?;
305        if resolved.is_dir() {
306            let mut symbols = Vec::new();
307            for file in WalkDir::new(&resolved)
308                .into_iter()
309                .filter_entry(|entry| !is_excluded(entry.path()))
310            {
311                let file = file?;
312                if !file.file_type().is_file() || language_for_path(file.path()).is_none() {
313                    continue;
314                }
315                let relative = self.project.to_relative(file.path());
316                let parsed = self.ensure_indexed(file.path(), &relative)?;
317                if !parsed.is_empty() {
318                    let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
319                    symbols.push(SymbolInfo {
320                        name: relative.clone(),
321                        kind: SymbolKind::File,
322                        file_path: relative.clone(),
323                        provenance: SymbolProvenance::from_path(&relative),
324                        line: 0,
325                        column: 0,
326                        signature: format!(
327                            "{} ({} symbols)",
328                            file.file_name().to_string_lossy(),
329                            parsed.len()
330                        ),
331                        name_path: relative,
332                        id,
333                        body: None,
334                        children: parsed
335                            .into_iter()
336                            .map(|symbol| to_symbol_info(symbol, depth))
337                            .collect(),
338                        start_byte: 0,
339                        end_byte: 0,
340                        end_line: 0,
341                    });
342                }
343            }
344            return Ok(symbols);
345        }
346
347        let relative = self.project.to_relative(&resolved);
348        let parsed = self.ensure_indexed(&resolved, &relative)?;
349        Ok(parsed
350            .into_iter()
351            .map(|symbol| to_symbol_info(symbol, depth))
352            .collect())
353    }
354
355    pub fn find_symbol(
356        &self,
357        name: &str,
358        file_path: Option<&str>,
359        include_body: bool,
360        exact_match: bool,
361        max_matches: usize,
362    ) -> Result<Vec<SymbolInfo>> {
363        // Fast path: if name looks like a stable symbol ID, parse and do targeted lookup
364        if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
365            let resolved = self.project.resolve(id_file)?;
366            let relative = self.project.to_relative(&resolved);
367            self.ensure_indexed(&resolved, &relative)?;
368            // Extract the leaf name from name_path (after last '/')
369            let leaf_name = id_name_path.rsplit('/').next().unwrap_or(id_name_path);
370            let db = self.writer();
371            let db_rows = db.find_symbols_by_name(leaf_name, Some(id_file), true, max_matches)?;
372            let mut results = Vec::new();
373            for row in db_rows {
374                if row.name_path != id_name_path {
375                    continue;
376                }
377                let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
378                let body = if include_body {
379                    let abs = self.project.as_path().join(&rel_path);
380                    fs::read_to_string(&abs).ok().map(|source| {
381                        let extended_start =
382                            extend_start_to_doc_comments(&source, row.start_byte as u32);
383                        slice_source(&source, extended_start, row.end_byte as u32)
384                    })
385                } else {
386                    None
387                };
388                let kind = SymbolKind::from_str_label(&row.kind);
389                let id = make_symbol_id(&rel_path, &kind, &row.name_path);
390                let prov = SymbolProvenance::from_path(&rel_path);
391                results.push(SymbolInfo {
392                    name: row.name,
393                    kind,
394                    provenance: prov,
395                    file_path: rel_path,
396                    line: row.line as usize,
397                    column: row.column_num as usize,
398                    signature: row.signature,
399                    name_path: row.name_path,
400                    id,
401                    body,
402                    children: Vec::new(),
403                    start_byte: row.start_byte as u32,
404                    end_byte: row.end_byte as u32,
405                    end_line: if row.end_line > 0 {
406                        row.end_line as usize
407                    } else {
408                        row.line as usize
409                    },
410                });
411            }
412            return Ok(results);
413        }
414
415        // Ensure target files are indexed first
416        if let Some(fp) = file_path {
417            let resolved = self.project.resolve(fp)?;
418            let relative = self.project.to_relative(&resolved);
419            self.ensure_indexed(&resolved, &relative)?;
420        } else {
421            // Ensure all files are indexed for a global search
422            let files = collect_candidate_files(self.project.as_path())?;
423            for file in &files {
424                let relative = self.project.to_relative(file);
425                self.ensure_indexed(file, &relative)?;
426            }
427        }
428
429        let db = self.writer();
430        let db_rows = db.find_symbols_by_name(name, file_path, exact_match, max_matches)?;
431
432        let mut results = Vec::new();
433        for row in db_rows {
434            let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
435            let body = if include_body {
436                let abs = self.project.as_path().join(&rel_path);
437                fs::read_to_string(&abs).ok().map(|source| {
438                    let extended_start =
439                        extend_start_to_doc_comments(&source, row.start_byte as u32);
440                    slice_source(&source, extended_start, row.end_byte as u32)
441                })
442            } else {
443                None
444            };
445            let kind = SymbolKind::from_str_label(&row.kind);
446            let id = make_symbol_id(&rel_path, &kind, &row.name_path);
447            let prov = SymbolProvenance::from_path(&rel_path);
448            results.push(SymbolInfo {
449                name: row.name,
450                kind,
451                provenance: prov,
452                file_path: rel_path,
453                line: row.line as usize,
454                column: row.column_num as usize,
455                signature: row.signature,
456                name_path: row.name_path,
457                id,
458                body,
459                children: Vec::new(),
460                start_byte: row.start_byte as u32,
461                end_byte: row.end_byte as u32,
462                end_line: if row.end_line > 0 {
463                    row.end_line as usize
464                } else {
465                    row.line as usize
466                },
467            });
468        }
469        Ok(results)
470    }
471
472    pub fn get_ranked_context(
473        &self,
474        query: &str,
475        path: Option<&str>,
476        max_tokens: usize,
477        include_body: bool,
478        depth: usize,
479    ) -> Result<RankedContextResult> {
480        let all_symbols = if let Some(path) = path {
481            self.get_symbols_overview(path, depth)?
482        } else {
483            // SelectSolve: file pre-filtering → top files → symbol extraction
484            self.select_solve_symbols(query, depth)?
485        };
486
487        let mut scored = all_symbols
488            .into_iter()
489            .flat_map(flatten_symbol_infos)
490            .filter_map(|symbol| score_symbol(query, &symbol).map(|score| (symbol, score)))
491            .collect::<Vec<_>>();
492        scored.sort_by(|left, right| right.1.cmp(&left.1));
493
494        let (selected, chars_used, pruned_count, last_kept_score) =
495            prune_to_budget(scored, max_tokens, include_body, self.project.as_path());
496
497        Ok(RankedContextResult {
498            query: query.to_owned(),
499            count: selected.len(),
500            symbols: selected,
501            token_budget: max_tokens,
502            chars_used,
503            pruned_count,
504            last_kept_score,
505        })
506    }
507
508    /// Access the underlying database (e.g. for import graph queries).
509    pub fn db(&self) -> std::sync::MutexGuard<'_, IndexDb> {
510        self.writer()
511    }
512}
513
514pub fn get_symbols_overview(
515    project: &ProjectRoot,
516    path: &str,
517    depth: usize,
518) -> Result<Vec<SymbolInfo>> {
519    let resolved = project.resolve(path)?;
520    if resolved.is_dir() {
521        return get_directory_symbols(project, &resolved, depth);
522    }
523    get_file_symbols(project, &resolved, depth)
524}
525
526/// Find the byte range (start_byte, end_byte) of a named symbol in a file.
527/// If name_path is provided (e.g. "ClassName/method"), matches by full name_path;
528/// otherwise matches by symbol name alone.
529pub fn find_symbol_range(
530    project: &ProjectRoot,
531    relative_path: &str,
532    symbol_name: &str,
533    name_path: Option<&str>,
534) -> Result<(usize, usize)> {
535    let file = project.resolve(relative_path)?;
536    let rel = project.to_relative(&file);
537    let Some(language_config) = language_for_path(&file) else {
538        bail!("unsupported file type: {}", file.display());
539    };
540    let source =
541        fs::read_to_string(&file).with_context(|| format!("failed to read {}", file.display()))?;
542    let parsed = parse_symbols(&language_config, &rel, &source, false)?;
543    let flat = flatten_symbols(parsed);
544
545    let candidate = if let Some(np) = name_path {
546        flat.into_iter()
547            .find(|sym| sym.name_path == np || sym.name == symbol_name)
548    } else {
549        flat.into_iter().find(|sym| sym.name == symbol_name)
550    };
551
552    match candidate {
553        Some(sym) => Ok((sym.start_byte as usize, sym.end_byte as usize)),
554        None => bail!(
555            "symbol '{}' not found in {}",
556            name_path.unwrap_or(symbol_name),
557            relative_path
558        ),
559    }
560}
561
562pub fn find_symbol(
563    project: &ProjectRoot,
564    name: &str,
565    file_path: Option<&str>,
566    include_body: bool,
567    exact_match: bool,
568    max_matches: usize,
569) -> Result<Vec<SymbolInfo>> {
570    // Fast path: stable symbol ID
571    if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
572        let resolved = project.resolve(id_file)?;
573        let rel = project.to_relative(&resolved);
574        let Some(language_config) = language_for_path(&resolved) else {
575            return Ok(Vec::new());
576        };
577        let source = fs::read_to_string(&resolved)?;
578        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
579        let mut results = Vec::new();
580        for symbol in flatten_symbols(parsed) {
581            if symbol.name_path == id_name_path {
582                results.push(to_symbol_info(symbol, usize::MAX));
583                if results.len() >= max_matches {
584                    return Ok(results);
585                }
586            }
587        }
588        return Ok(results);
589    }
590
591    let files = match file_path {
592        Some(path) => vec![project.resolve(path)?],
593        None => collect_candidate_files(project.as_path())?,
594    };
595
596    let query = name.to_lowercase();
597    let mut results = Vec::new();
598
599    for file in files {
600        let rel = project.to_relative(&file);
601        let Some(language_config) = language_for_path(&file) else {
602            continue;
603        };
604        let source = match fs::read_to_string(&file) {
605            Ok(source) => source,
606            Err(_) => continue,
607        };
608        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
609        for symbol in flatten_symbols(parsed) {
610            let matched = if exact_match {
611                symbol.name == name
612            } else {
613                scoring::contains_ascii_ci(&symbol.name, &query)
614            };
615            if matched {
616                results.push(to_symbol_info(symbol, usize::MAX));
617                if results.len() >= max_matches {
618                    return Ok(results);
619                }
620            }
621        }
622    }
623
624    Ok(results)
625}
626
627fn get_directory_symbols(
628    project: &ProjectRoot,
629    dir: &Path,
630    depth: usize,
631) -> Result<Vec<SymbolInfo>> {
632    let mut symbols = Vec::new();
633    for entry in WalkDir::new(dir)
634        .into_iter()
635        .filter_entry(|entry| !is_excluded(entry.path()))
636    {
637        let entry = entry?;
638        if !entry.file_type().is_file() {
639            continue;
640        }
641        let path = entry.path();
642        if language_for_path(path).is_none() {
643            continue;
644        }
645        let file_symbols = get_file_symbols(project, path, depth)?;
646        if !file_symbols.is_empty() {
647            let relative = project.to_relative(path);
648            let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
649            symbols.push(SymbolInfo {
650                name: relative.clone(),
651                kind: SymbolKind::File,
652                file_path: relative.clone(),
653                provenance: SymbolProvenance::from_path(&relative),
654                line: 0,
655                column: 0,
656                signature: format!(
657                    "{} ({} symbols)",
658                    path.file_name()
659                        .and_then(|name| name.to_str())
660                        .unwrap_or_default(),
661                    file_symbols.len()
662                ),
663                name_path: relative,
664                id,
665                body: None,
666                children: file_symbols,
667                start_byte: 0,
668                end_byte: 0,
669                end_line: 0,
670            });
671        }
672    }
673    Ok(symbols)
674}
675
676fn get_file_symbols(project: &ProjectRoot, file: &Path, depth: usize) -> Result<Vec<SymbolInfo>> {
677    let relative = project.to_relative(file);
678    let Some(language_config) = language_for_path(file) else {
679        return Ok(Vec::new());
680    };
681    let source =
682        fs::read_to_string(file).with_context(|| format!("failed to read {}", file.display()))?;
683    let parsed = parse_symbols(&language_config, &relative, &source, false)?;
684    Ok(parsed
685        .into_iter()
686        .map(|symbol| to_symbol_info(symbol, depth))
687        .collect())
688}
689
690fn collect_candidate_files(root: &Path) -> Result<Vec<PathBuf>> {
691    collect_files(root, |path| language_for_path(path).is_some())
692}
693
694fn file_modified_ms(path: &Path) -> Result<u128> {
695    let modified = fs::metadata(path)
696        .with_context(|| format!("failed to stat {}", path.display()))?
697        .modified()
698        .with_context(|| format!("failed to read mtime for {}", path.display()))?;
699    Ok(modified
700        .duration_since(UNIX_EPOCH)
701        .unwrap_or_default()
702        .as_millis())
703}