Skip to main content

codelens_engine/symbols/
mod.rs

1mod parser;
2mod ranking;
3mod reader;
4pub mod scoring;
5#[cfg(test)]
6mod tests;
7mod types;
8mod writer;
9
10use parser::{flatten_symbol_infos, flatten_symbols, parse_symbols, slice_source, to_symbol_info};
11use ranking::prune_to_budget;
12use scoring::score_symbol;
13pub use scoring::{
14    sparse_coverage_bonus_from_fields, sparse_max_bonus, sparse_threshold, sparse_weighting_enabled,
15};
16pub(crate) use types::ReadDb;
17pub use types::{
18    IndexStats, RankedContextEntry, RankedContextResult, SymbolInfo, SymbolKind, SymbolProvenance,
19    make_symbol_id, parse_symbol_id,
20};
21
22use crate::db::{self, IndexDb, content_hash, index_db_path};
23// Re-export language_for_path so downstream crate modules keep working.
24pub(crate) use crate::lang_config::{LanguageConfig, language_for_path};
25use crate::project::ProjectRoot;
26use anyhow::{Context, Result, bail};
27use std::fs;
28use std::path::{Path, PathBuf};
29use std::time::UNIX_EPOCH;
30use walkdir::WalkDir;
31
32use crate::project::{collect_files, is_excluded};
33
34// Types (SymbolKind, SymbolInfo, ParsedSymbol, IndexStats, RankedContextEntry,
35// RankedContextResult, ReadDb) are in types.rs, re-exported above.
36
/// SQLite-backed symbol index for a project.
///
/// Architecture: writer `Mutex<IndexDb>` for mutations + per-query read-only
/// connections for `_cached` methods. This makes `SymbolIndex: Send + Sync`,
/// enabling `Arc<SymbolIndex>` without an external Mutex.
pub struct SymbolIndex {
    /// Project root against which relative paths are resolved.
    project: ProjectRoot,
    /// Location of the index database file (left empty by `new_memory`).
    db_path: PathBuf,
    /// The single writer connection; also serves reads in in-memory mode.
    writer: std::sync::Mutex<IndexDb>,
    /// In-memory mode flag (tests) — when true, _cached reads use the writer.
    in_memory: bool,
}
49
50impl SymbolIndex {
51    pub fn new(project: ProjectRoot) -> Self {
52        let db_path = index_db_path(project.as_path());
53        let db = IndexDb::open(&db_path).unwrap_or_else(|e| {
54            tracing::warn!(
55                path = %db_path.display(),
56                error = %e,
57                "failed to open DB, falling back to in-memory"
58            );
59            IndexDb::open_memory().unwrap()
60        });
61        let in_memory = !db_path.is_file();
62        let mut idx = Self {
63            project,
64            db_path,
65            writer: std::sync::Mutex::new(db),
66            in_memory,
67        };
68        // Auto-migrate from legacy JSON index if DB is empty
69        if idx.writer().file_count().unwrap_or(0) == 0 {
70            let _ = idx.migrate_from_json();
71        }
72        idx
73    }
74
75    /// Acquire the writer connection (poison-safe).
76    fn writer(&self) -> std::sync::MutexGuard<'_, IndexDb> {
77        self.writer
78            .lock()
79            .unwrap_or_else(|poisoned| poisoned.into_inner())
80    }
81
82    /// Open a read-only DB connection for queries (or fall back to writer for in-memory).
83    fn reader(&self) -> Result<ReadDb<'_>> {
84        if self.in_memory {
85            return Ok(ReadDb::Writer(self.writer()));
86        }
87        match IndexDb::open_readonly(&self.db_path)? {
88            Some(db) => Ok(ReadDb::Owned(db)),
89            None => Ok(ReadDb::Writer(self.writer())),
90        }
91    }
92
93    /// Create an in-memory index (for tests and benchmarks — no disk persistence).
94    pub fn new_memory(project: ProjectRoot) -> Self {
95        let db = IndexDb::open_memory().unwrap();
96        Self {
97            db_path: PathBuf::new(),
98            project,
99            writer: std::sync::Mutex::new(db),
100            in_memory: true,
101        }
102    }
103
104    pub fn stats(&self) -> Result<IndexStats> {
105        let db = self.reader()?;
106        let supported_files = collect_candidate_files(self.project.as_path())?;
107        let indexed_files = db.file_count()?;
108        let indexed_paths = db.all_file_paths()?;
109
110        let mut stale = 0usize;
111        for rel in &indexed_paths {
112            let path = self.project.as_path().join(rel);
113            if !path.is_file() {
114                stale += 1;
115                continue;
116            }
117            let content = match fs::read(&path) {
118                Ok(c) => c,
119                Err(_) => {
120                    stale += 1;
121                    continue;
122                }
123            };
124            let hash = content_hash(&content);
125            let mtime = file_modified_ms(&path).unwrap_or(0) as i64;
126            if db.get_fresh_file(rel, mtime, &hash)?.is_none() {
127                stale += 1;
128            }
129        }
130
131        Ok(IndexStats {
132            indexed_files,
133            supported_files: supported_files.len(),
134            stale_files: stale,
135        })
136    }
137
138    /// SelectSolve file pre-filtering: score files by name relevance to query,
139    /// then extract symbols only from top-scoring files.
140    /// Path-first retrieval with FTS5 boost: file paths scored by query token
141    /// matching, then boosted by FTS5 symbol hits in the same file.
142    fn select_solve_symbols(&self, query: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
143        // Collect file paths and compute top matches inside a block so the
144        // MutexGuard (ReadDb::Writer) is dropped before we call find_symbol /
145        // get_symbols_overview_cached, which also need the lock.  Holding the
146        // guard across those calls causes a deadlock with in-memory DBs.
147        //
148        // FTS5 boost: search each query token as a symbol name via FTS5,
149        // collect which files contain matching symbols, and boost those files.
150        // Token-level search is critical for NL queries like "how does dispatch
151        // work" — the full query won't match any symbol, but "dispatch" will
152        // find dispatch_tool in dispatch/mod.rs.
153        let fts_file_boost: std::collections::HashSet<String> = {
154            let query_lower = query.to_ascii_lowercase();
155            let tokens: Vec<&str> = query_lower
156                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
157                .filter(|t| t.len() >= 3)
158                .collect();
159            let mut boost_files = std::collections::HashSet::new();
160            // First try full query (catches exact symbol names like "dispatch_tool")
161            if let Ok(hits) = self.find_symbol(query, None, false, false, 15) {
162                for sym in hits {
163                    boost_files.insert(sym.file_path);
164                }
165            }
166            // Then try individual tokens (catches NL queries)
167            for token in &tokens {
168                if let Ok(hits) = self.find_symbol(token, None, false, false, 10) {
169                    for sym in hits {
170                        boost_files.insert(sym.file_path);
171                    }
172                }
173            }
174            boost_files
175        };
176
177        let (top_files, importer_files): (Vec<String>, Vec<String>) = {
178            let db = self.reader()?;
179            let all_paths = db.all_file_paths()?;
180
181            let query_lower = query.to_ascii_lowercase();
182            let query_tokens: Vec<&str> = query_lower
183                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
184                .filter(|t| t.len() >= 3)
185                .collect();
186
187            let mut file_scores: Vec<(String, usize)> = all_paths
188                .into_iter()
189                .map(|path| {
190                    let path_lower = path.to_ascii_lowercase();
191                    let mut score = query_tokens
192                        .iter()
193                        .filter(|token| path_lower.contains(**token))
194                        .count();
195                    // FTS5 boost: files containing matching symbols get +2
196                    if fts_file_boost.contains(&path) {
197                        score += 2;
198                    }
199                    (path, score)
200                })
201                .collect();
202
203            file_scores.sort_by(|a, b| b.1.cmp(&a.1));
204            let top: Vec<String> = file_scores
205                .into_iter()
206                .filter(|(_, score)| *score > 0)
207                .take(10)
208                .map(|(path, _)| path)
209                .collect();
210
211            // Import graph proximity: files that import top-matched files
212            // provide structural context (callers, consumers of matched code).
213            let mut importers = Vec::new();
214            if !top.is_empty() && top.len() <= 5 {
215                for file_path in top.iter().take(3) {
216                    if let Ok(imp) = db.get_importers(file_path) {
217                        for importer_path in imp.into_iter().take(3) {
218                            importers.push(importer_path);
219                        }
220                    }
221                }
222            }
223
224            (top, importers)
225            // db (MutexGuard) dropped here
226        };
227
228        // If no file matches (path + FTS5 both empty), fall back to broad symbol search
229        if top_files.is_empty() {
230            return self.find_symbol(query, None, false, false, 500);
231        }
232
233        // Collect symbols from top files
234        let mut all_symbols = Vec::new();
235        for file_path in &top_files {
236            if let Ok(symbols) = self.get_symbols_overview_cached(file_path, depth) {
237                all_symbols.extend(symbols);
238            }
239        }
240
241        // Import graph proximity: include symbols from files that import top matches.
242        // These provide structural context (callers, consumers of matched code).
243        for importer_path in &importer_files {
244            if let Ok(symbols) = self.get_symbols_overview_cached(importer_path, 1) {
245                all_symbols.extend(symbols);
246            }
247        }
248
249        // Also include direct symbol name matches (for exact/substring hits)
250        let mut seen_ids: std::collections::HashSet<String> =
251            all_symbols.iter().map(|s| s.id.clone()).collect();
252
253        if let Ok(direct) = self.find_symbol(query, None, false, false, 50) {
254            for sym in direct {
255                if seen_ids.insert(sym.id.clone()) {
256                    all_symbols.push(sym);
257                }
258            }
259        }
260
261        // For multi-word queries, also search individual tokens as symbol names
262        // (e.g., "dispatch tool call" → search for "dispatch", "tool", "call")
263        let query_lower = query.to_ascii_lowercase();
264        let tokens: Vec<&str> = query_lower
265            .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
266            .filter(|t| t.len() >= 3)
267            .collect();
268        if tokens.len() >= 2 {
269            for token in &tokens {
270                match self.find_symbol(token, None, false, false, 10) {
271                    Ok(hits) => {
272                        for sym in hits {
273                            if seen_ids.insert(sym.id.clone()) {
274                                all_symbols.push(sym);
275                            }
276                        }
277                    }
278                    Err(e) => {
279                        tracing::debug!(token, error = %e, "token find_symbol failed");
280                    }
281                }
282            }
283        }
284
285        Ok(all_symbols)
286    }
287
288    /// Hierarchical project structure: per-directory file count + symbol count.
289    /// Used as Level 1 pruning — lets LLM decide which directories to drill into.
290    pub fn get_project_structure(&self) -> Result<Vec<db::DirStats>> {
291        let db = self.reader()?;
292        db.dir_stats()
293    }
294
295    pub fn get_symbols_overview(&self, path: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
296        let resolved = self.project.resolve(path)?;
297        if resolved.is_dir() {
298            let mut symbols = Vec::new();
299            for file in WalkDir::new(&resolved)
300                .into_iter()
301                .filter_entry(|entry| !is_excluded(entry.path()))
302            {
303                let file = file?;
304                if !file.file_type().is_file() || language_for_path(file.path()).is_none() {
305                    continue;
306                }
307                let relative = self.project.to_relative(file.path());
308                let parsed = self.ensure_indexed(file.path(), &relative)?;
309                if !parsed.is_empty() {
310                    let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
311                    symbols.push(SymbolInfo {
312                        name: relative.clone(),
313                        kind: SymbolKind::File,
314                        file_path: relative.clone(),
315                        provenance: SymbolProvenance::from_path(&relative),
316                        line: 0,
317                        column: 0,
318                        signature: format!(
319                            "{} ({} symbols)",
320                            file.file_name().to_string_lossy(),
321                            parsed.len()
322                        ),
323                        name_path: relative,
324                        id,
325                        body: None,
326                        children: parsed
327                            .into_iter()
328                            .map(|symbol| to_symbol_info(symbol, depth))
329                            .collect(),
330                        start_byte: 0,
331                        end_byte: 0,
332                    });
333                }
334            }
335            return Ok(symbols);
336        }
337
338        let relative = self.project.to_relative(&resolved);
339        let parsed = self.ensure_indexed(&resolved, &relative)?;
340        Ok(parsed
341            .into_iter()
342            .map(|symbol| to_symbol_info(symbol, depth))
343            .collect())
344    }
345
346    pub fn find_symbol(
347        &self,
348        name: &str,
349        file_path: Option<&str>,
350        include_body: bool,
351        exact_match: bool,
352        max_matches: usize,
353    ) -> Result<Vec<SymbolInfo>> {
354        // Fast path: if name looks like a stable symbol ID, parse and do targeted lookup
355        if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
356            let resolved = self.project.resolve(id_file)?;
357            let relative = self.project.to_relative(&resolved);
358            self.ensure_indexed(&resolved, &relative)?;
359            // Extract the leaf name from name_path (after last '/')
360            let leaf_name = id_name_path.rsplit('/').next().unwrap_or(id_name_path);
361            let db = self.writer();
362            let db_rows = db.find_symbols_by_name(leaf_name, Some(id_file), true, max_matches)?;
363            let mut results = Vec::new();
364            for row in db_rows {
365                if row.name_path != id_name_path {
366                    continue;
367                }
368                let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
369                let body = if include_body {
370                    let abs = self.project.as_path().join(&rel_path);
371                    fs::read_to_string(&abs).ok().map(|source| {
372                        slice_source(&source, row.start_byte as u32, row.end_byte as u32)
373                    })
374                } else {
375                    None
376                };
377                let kind = SymbolKind::from_str_label(&row.kind);
378                let id = make_symbol_id(&rel_path, &kind, &row.name_path);
379                let prov = SymbolProvenance::from_path(&rel_path);
380                results.push(SymbolInfo {
381                    name: row.name,
382                    kind,
383                    provenance: prov,
384                    file_path: rel_path,
385                    line: row.line as usize,
386                    column: row.column_num as usize,
387                    signature: row.signature,
388                    name_path: row.name_path,
389                    id,
390                    body,
391                    children: Vec::new(),
392                    start_byte: row.start_byte as u32,
393                    end_byte: row.end_byte as u32,
394                });
395            }
396            return Ok(results);
397        }
398
399        // Ensure target files are indexed first
400        if let Some(fp) = file_path {
401            let resolved = self.project.resolve(fp)?;
402            let relative = self.project.to_relative(&resolved);
403            self.ensure_indexed(&resolved, &relative)?;
404        } else {
405            // Ensure all files are indexed for a global search
406            let files = collect_candidate_files(self.project.as_path())?;
407            for file in &files {
408                let relative = self.project.to_relative(file);
409                self.ensure_indexed(file, &relative)?;
410            }
411        }
412
413        let db = self.writer();
414        let db_rows = db.find_symbols_by_name(name, file_path, exact_match, max_matches)?;
415
416        let mut results = Vec::new();
417        for row in db_rows {
418            let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
419            let body = if include_body {
420                let abs = self.project.as_path().join(&rel_path);
421                fs::read_to_string(&abs)
422                    .ok()
423                    .map(|source| slice_source(&source, row.start_byte as u32, row.end_byte as u32))
424            } else {
425                None
426            };
427            let kind = SymbolKind::from_str_label(&row.kind);
428            let id = make_symbol_id(&rel_path, &kind, &row.name_path);
429            let prov = SymbolProvenance::from_path(&rel_path);
430            results.push(SymbolInfo {
431                name: row.name,
432                kind,
433                provenance: prov,
434                file_path: rel_path,
435                line: row.line as usize,
436                column: row.column_num as usize,
437                signature: row.signature,
438                name_path: row.name_path,
439                id,
440                body,
441                children: Vec::new(),
442                start_byte: row.start_byte as u32,
443                end_byte: row.end_byte as u32,
444            });
445        }
446        Ok(results)
447    }
448
449    pub fn get_ranked_context(
450        &self,
451        query: &str,
452        path: Option<&str>,
453        max_tokens: usize,
454        include_body: bool,
455        depth: usize,
456    ) -> Result<RankedContextResult> {
457        let all_symbols = if let Some(path) = path {
458            self.get_symbols_overview(path, depth)?
459        } else {
460            // SelectSolve: file pre-filtering → top files → symbol extraction
461            self.select_solve_symbols(query, depth)?
462        };
463
464        let mut scored = all_symbols
465            .into_iter()
466            .flat_map(flatten_symbol_infos)
467            .filter_map(|symbol| score_symbol(query, &symbol).map(|score| (symbol, score)))
468            .collect::<Vec<_>>();
469        scored.sort_by(|left, right| right.1.cmp(&left.1));
470
471        let (selected, chars_used) =
472            prune_to_budget(scored, max_tokens, include_body, self.project.as_path());
473
474        Ok(RankedContextResult {
475            query: query.to_owned(),
476            count: selected.len(),
477            symbols: selected,
478            token_budget: max_tokens,
479            chars_used,
480        })
481    }
482
483    /// Access the underlying database (e.g. for import graph queries).
484    pub fn db(&self) -> std::sync::MutexGuard<'_, IndexDb> {
485        self.writer()
486    }
487}
488
489pub fn get_symbols_overview(
490    project: &ProjectRoot,
491    path: &str,
492    depth: usize,
493) -> Result<Vec<SymbolInfo>> {
494    let resolved = project.resolve(path)?;
495    if resolved.is_dir() {
496        return get_directory_symbols(project, &resolved, depth);
497    }
498    get_file_symbols(project, &resolved, depth)
499}
500
501/// Find the byte range (start_byte, end_byte) of a named symbol in a file.
502/// If name_path is provided (e.g. "ClassName/method"), matches by full name_path;
503/// otherwise matches by symbol name alone.
504pub fn find_symbol_range(
505    project: &ProjectRoot,
506    relative_path: &str,
507    symbol_name: &str,
508    name_path: Option<&str>,
509) -> Result<(usize, usize)> {
510    let file = project.resolve(relative_path)?;
511    let rel = project.to_relative(&file);
512    let Some(language_config) = language_for_path(&file) else {
513        bail!("unsupported file type: {}", file.display());
514    };
515    let source =
516        fs::read_to_string(&file).with_context(|| format!("failed to read {}", file.display()))?;
517    let parsed = parse_symbols(&language_config, &rel, &source, false)?;
518    let flat = flatten_symbols(parsed);
519
520    let candidate = if let Some(np) = name_path {
521        flat.into_iter()
522            .find(|sym| sym.name_path == np || sym.name == symbol_name)
523    } else {
524        flat.into_iter().find(|sym| sym.name == symbol_name)
525    };
526
527    match candidate {
528        Some(sym) => Ok((sym.start_byte as usize, sym.end_byte as usize)),
529        None => bail!(
530            "symbol '{}' not found in {}",
531            name_path.unwrap_or(symbol_name),
532            relative_path
533        ),
534    }
535}
536
537pub fn find_symbol(
538    project: &ProjectRoot,
539    name: &str,
540    file_path: Option<&str>,
541    include_body: bool,
542    exact_match: bool,
543    max_matches: usize,
544) -> Result<Vec<SymbolInfo>> {
545    // Fast path: stable symbol ID
546    if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
547        let resolved = project.resolve(id_file)?;
548        let rel = project.to_relative(&resolved);
549        let Some(language_config) = language_for_path(&resolved) else {
550            return Ok(Vec::new());
551        };
552        let source = fs::read_to_string(&resolved)?;
553        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
554        let mut results = Vec::new();
555        for symbol in flatten_symbols(parsed) {
556            if symbol.name_path == id_name_path {
557                results.push(to_symbol_info(symbol, usize::MAX));
558                if results.len() >= max_matches {
559                    return Ok(results);
560                }
561            }
562        }
563        return Ok(results);
564    }
565
566    let files = match file_path {
567        Some(path) => vec![project.resolve(path)?],
568        None => collect_candidate_files(project.as_path())?,
569    };
570
571    let query = name.to_lowercase();
572    let mut results = Vec::new();
573
574    for file in files {
575        let rel = project.to_relative(&file);
576        let Some(language_config) = language_for_path(&file) else {
577            continue;
578        };
579        let source = match fs::read_to_string(&file) {
580            Ok(source) => source,
581            Err(_) => continue,
582        };
583        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
584        for symbol in flatten_symbols(parsed) {
585            let matched = if exact_match {
586                symbol.name == name
587            } else {
588                scoring::contains_ascii_ci(&symbol.name, &query)
589            };
590            if matched {
591                results.push(to_symbol_info(symbol, usize::MAX));
592                if results.len() >= max_matches {
593                    return Ok(results);
594                }
595            }
596        }
597    }
598
599    Ok(results)
600}
601
602fn get_directory_symbols(
603    project: &ProjectRoot,
604    dir: &Path,
605    depth: usize,
606) -> Result<Vec<SymbolInfo>> {
607    let mut symbols = Vec::new();
608    for entry in WalkDir::new(dir)
609        .into_iter()
610        .filter_entry(|entry| !is_excluded(entry.path()))
611    {
612        let entry = entry?;
613        if !entry.file_type().is_file() {
614            continue;
615        }
616        let path = entry.path();
617        if language_for_path(path).is_none() {
618            continue;
619        }
620        let file_symbols = get_file_symbols(project, path, depth)?;
621        if !file_symbols.is_empty() {
622            let relative = project.to_relative(path);
623            let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
624            symbols.push(SymbolInfo {
625                name: relative.clone(),
626                kind: SymbolKind::File,
627                file_path: relative.clone(),
628                provenance: SymbolProvenance::from_path(&relative),
629                line: 0,
630                column: 0,
631                signature: format!(
632                    "{} ({} symbols)",
633                    path.file_name()
634                        .and_then(|name| name.to_str())
635                        .unwrap_or_default(),
636                    file_symbols.len()
637                ),
638                name_path: relative,
639                id,
640                body: None,
641                children: file_symbols,
642                start_byte: 0,
643                end_byte: 0,
644            });
645        }
646    }
647    Ok(symbols)
648}
649
650fn get_file_symbols(project: &ProjectRoot, file: &Path, depth: usize) -> Result<Vec<SymbolInfo>> {
651    let relative = project.to_relative(file);
652    let Some(language_config) = language_for_path(file) else {
653        return Ok(Vec::new());
654    };
655    let source =
656        fs::read_to_string(file).with_context(|| format!("failed to read {}", file.display()))?;
657    let parsed = parse_symbols(&language_config, &relative, &source, false)?;
658    Ok(parsed
659        .into_iter()
660        .map(|symbol| to_symbol_info(symbol, depth))
661        .collect())
662}
663
664fn collect_candidate_files(root: &Path) -> Result<Vec<PathBuf>> {
665    collect_files(root, |path| language_for_path(path).is_some())
666}
667
668fn file_modified_ms(path: &Path) -> Result<u128> {
669    let modified = fs::metadata(path)
670        .with_context(|| format!("failed to stat {}", path.display()))?
671        .modified()
672        .with_context(|| format!("failed to read mtime for {}", path.display()))?;
673    Ok(modified
674        .duration_since(UNIX_EPOCH)
675        .unwrap_or_default()
676        .as_millis())
677}