Skip to main content

codelens_engine/symbols/
mod.rs

1mod parser;
2mod ranking;
3mod reader;
4pub mod scoring;
5#[cfg(test)]
6mod tests;
7mod types;
8mod writer;
9
10use parser::{flatten_symbol_infos, flatten_symbols, parse_symbols, slice_source, to_symbol_info};
11use ranking::prune_to_budget;
12use scoring::score_symbol;
13pub use scoring::{
14    sparse_coverage_bonus_from_fields, sparse_max_bonus, sparse_threshold, sparse_weighting_enabled,
15};
16pub(crate) use types::ReadDb;
17pub use types::{
18    make_symbol_id, parse_symbol_id, IndexStats, RankedContextEntry, RankedContextResult,
19    SymbolInfo, SymbolKind,
20};
21
22use crate::db::{self, content_hash, index_db_path, IndexDb};
23// Re-export language_for_path so downstream crate modules keep working.
24pub(crate) use crate::lang_config::{language_for_path, LanguageConfig};
25use crate::project::ProjectRoot;
26use anyhow::{bail, Context, Result};
27use std::fs;
28use std::path::{Path, PathBuf};
29use std::time::UNIX_EPOCH;
30use walkdir::WalkDir;
31
32use crate::project::{collect_files, is_excluded};
33
34// Types (SymbolKind, SymbolInfo, ParsedSymbol, IndexStats, RankedContextEntry,
35// RankedContextResult, ReadDb) are in types.rs, re-exported above.
36
37/// SQLite-backed symbol index for a project.
38///
39/// Architecture: writer `Mutex<IndexDb>` for mutations + per-query read-only
40/// connections for `_cached` methods. This makes `SymbolIndex: Send + Sync`,
41/// enabling `Arc<SymbolIndex>` without an external Mutex.
42pub struct SymbolIndex {
43    project: ProjectRoot,
44    db_path: PathBuf,
45    writer: std::sync::Mutex<IndexDb>,
46    /// In-memory mode flag (tests) — when true, _cached reads use the writer.
47    in_memory: bool,
48}
49
50impl SymbolIndex {
51    pub fn new(project: ProjectRoot) -> Self {
52        let db_path = index_db_path(project.as_path());
53        let db = IndexDb::open(&db_path).unwrap_or_else(|e| {
54            tracing::warn!(
55                path = %db_path.display(),
56                error = %e,
57                "failed to open DB, falling back to in-memory"
58            );
59            IndexDb::open_memory().unwrap()
60        });
61        let in_memory = !db_path.is_file();
62        let mut idx = Self {
63            project,
64            db_path,
65            writer: std::sync::Mutex::new(db),
66            in_memory,
67        };
68        // Auto-migrate from legacy JSON index if DB is empty
69        if idx.writer().file_count().unwrap_or(0) == 0 {
70            let _ = idx.migrate_from_json();
71        }
72        idx
73    }
74
75    /// Acquire the writer connection (poison-safe).
76    fn writer(&self) -> std::sync::MutexGuard<'_, IndexDb> {
77        self.writer
78            .lock()
79            .unwrap_or_else(|poisoned| poisoned.into_inner())
80    }
81
82    /// Open a read-only DB connection for queries (or fall back to writer for in-memory).
83    fn reader(&self) -> Result<ReadDb<'_>> {
84        if self.in_memory {
85            return Ok(ReadDb::Writer(self.writer()));
86        }
87        match IndexDb::open_readonly(&self.db_path)? {
88            Some(db) => Ok(ReadDb::Owned(db)),
89            None => Ok(ReadDb::Writer(self.writer())),
90        }
91    }
92
93    /// Create an in-memory index (for tests and benchmarks — no disk persistence).
94    pub fn new_memory(project: ProjectRoot) -> Self {
95        let db = IndexDb::open_memory().unwrap();
96        Self {
97            db_path: PathBuf::new(),
98            project,
99            writer: std::sync::Mutex::new(db),
100            in_memory: true,
101        }
102    }
103
104    pub fn stats(&self) -> Result<IndexStats> {
105        let db = self.reader()?;
106        let supported_files = collect_candidate_files(self.project.as_path())?;
107        let indexed_files = db.file_count()?;
108        let indexed_paths = db.all_file_paths()?;
109
110        let mut stale = 0usize;
111        for rel in &indexed_paths {
112            let path = self.project.as_path().join(rel);
113            if !path.is_file() {
114                stale += 1;
115                continue;
116            }
117            let content = match fs::read(&path) {
118                Ok(c) => c,
119                Err(_) => {
120                    stale += 1;
121                    continue;
122                }
123            };
124            let hash = content_hash(&content);
125            let mtime = file_modified_ms(&path).unwrap_or(0) as i64;
126            if db.get_fresh_file(rel, mtime, &hash)?.is_none() {
127                stale += 1;
128            }
129        }
130
131        Ok(IndexStats {
132            indexed_files,
133            supported_files: supported_files.len(),
134            stale_files: stale,
135        })
136    }
137
138    /// SelectSolve file pre-filtering: score files by name relevance to query,
139    /// then extract symbols only from top-scoring files.
140    /// Path-first retrieval with FTS5 boost: file paths scored by query token
141    /// matching, then boosted by FTS5 symbol hits in the same file.
142    fn select_solve_symbols(&self, query: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
143        // Collect file paths and compute top matches inside a block so the
144        // MutexGuard (ReadDb::Writer) is dropped before we call find_symbol /
145        // get_symbols_overview_cached, which also need the lock.  Holding the
146        // guard across those calls causes a deadlock with in-memory DBs.
147        //
148        // FTS5 boost: search each query token as a symbol name via FTS5,
149        // collect which files contain matching symbols, and boost those files.
150        // Token-level search is critical for NL queries like "how does dispatch
151        // work" — the full query won't match any symbol, but "dispatch" will
152        // find dispatch_tool in dispatch/mod.rs.
153        let fts_file_boost: std::collections::HashSet<String> = {
154            let query_lower = query.to_ascii_lowercase();
155            let tokens: Vec<&str> = query_lower
156                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
157                .filter(|t| t.len() >= 3)
158                .collect();
159            let mut boost_files = std::collections::HashSet::new();
160            // First try full query (catches exact symbol names like "dispatch_tool")
161            if let Ok(hits) = self.find_symbol(query, None, false, false, 15) {
162                for sym in hits {
163                    boost_files.insert(sym.file_path);
164                }
165            }
166            // Then try individual tokens (catches NL queries)
167            for token in &tokens {
168                if let Ok(hits) = self.find_symbol(token, None, false, false, 10) {
169                    for sym in hits {
170                        boost_files.insert(sym.file_path);
171                    }
172                }
173            }
174            boost_files
175        };
176
177        let (top_files, importer_files): (Vec<String>, Vec<String>) = {
178            let db = self.reader()?;
179            let all_paths = db.all_file_paths()?;
180
181            let query_lower = query.to_ascii_lowercase();
182            let query_tokens: Vec<&str> = query_lower
183                .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
184                .filter(|t| t.len() >= 3)
185                .collect();
186
187            let mut file_scores: Vec<(String, usize)> = all_paths
188                .into_iter()
189                .map(|path| {
190                    let path_lower = path.to_ascii_lowercase();
191                    let mut score = query_tokens
192                        .iter()
193                        .filter(|token| path_lower.contains(**token))
194                        .count();
195                    // FTS5 boost: files containing matching symbols get +2
196                    if fts_file_boost.contains(&path) {
197                        score += 2;
198                    }
199                    (path, score)
200                })
201                .collect();
202
203            file_scores.sort_by(|a, b| b.1.cmp(&a.1));
204            let top: Vec<String> = file_scores
205                .into_iter()
206                .filter(|(_, score)| *score > 0)
207                .take(10)
208                .map(|(path, _)| path)
209                .collect();
210
211            // Import graph proximity: files that import top-matched files
212            // provide structural context (callers, consumers of matched code).
213            let mut importers = Vec::new();
214            if !top.is_empty() && top.len() <= 5 {
215                for file_path in top.iter().take(3) {
216                    if let Ok(imp) = db.get_importers(file_path) {
217                        for importer_path in imp.into_iter().take(3) {
218                            importers.push(importer_path);
219                        }
220                    }
221                }
222            }
223
224            (top, importers)
225            // db (MutexGuard) dropped here
226        };
227
228        // If no file matches (path + FTS5 both empty), fall back to broad symbol search
229        if top_files.is_empty() {
230            return self.find_symbol(query, None, false, false, 500);
231        }
232
233        // Collect symbols from top files
234        let mut all_symbols = Vec::new();
235        for file_path in &top_files {
236            if let Ok(symbols) = self.get_symbols_overview_cached(file_path, depth) {
237                all_symbols.extend(symbols);
238            }
239        }
240
241        // Import graph proximity: include symbols from files that import top matches.
242        // These provide structural context (callers, consumers of matched code).
243        for importer_path in &importer_files {
244            if let Ok(symbols) = self.get_symbols_overview_cached(importer_path, 1) {
245                all_symbols.extend(symbols);
246            }
247        }
248
249        // Also include direct symbol name matches (for exact/substring hits)
250        let mut seen_ids: std::collections::HashSet<String> =
251            all_symbols.iter().map(|s| s.id.clone()).collect();
252
253        if let Ok(direct) = self.find_symbol(query, None, false, false, 50) {
254            for sym in direct {
255                if seen_ids.insert(sym.id.clone()) {
256                    all_symbols.push(sym);
257                }
258            }
259        }
260
261        // For multi-word queries, also search individual tokens as symbol names
262        // (e.g., "dispatch tool call" → search for "dispatch", "tool", "call")
263        let query_lower = query.to_ascii_lowercase();
264        let tokens: Vec<&str> = query_lower
265            .split(|c: char| c.is_whitespace() || c == '_' || c == '-')
266            .filter(|t| t.len() >= 3)
267            .collect();
268        if tokens.len() >= 2 {
269            for token in &tokens {
270                match self.find_symbol(token, None, false, false, 10) {
271                    Ok(hits) => {
272                        for sym in hits {
273                            if seen_ids.insert(sym.id.clone()) {
274                                all_symbols.push(sym);
275                            }
276                        }
277                    }
278                    Err(e) => {
279                        tracing::debug!(token, error = %e, "token find_symbol failed");
280                    }
281                }
282            }
283        }
284
285        Ok(all_symbols)
286    }
287
288    /// Hierarchical project structure: per-directory file count + symbol count.
289    /// Used as Level 1 pruning — lets LLM decide which directories to drill into.
290    pub fn get_project_structure(&self) -> Result<Vec<db::DirStats>> {
291        let db = self.reader()?;
292        db.dir_stats()
293    }
294
295    pub fn get_symbols_overview(&self, path: &str, depth: usize) -> Result<Vec<SymbolInfo>> {
296        let resolved = self.project.resolve(path)?;
297        if resolved.is_dir() {
298            let mut symbols = Vec::new();
299            for file in WalkDir::new(&resolved)
300                .into_iter()
301                .filter_entry(|entry| !is_excluded(entry.path()))
302            {
303                let file = file?;
304                if !file.file_type().is_file() || language_for_path(file.path()).is_none() {
305                    continue;
306                }
307                let relative = self.project.to_relative(file.path());
308                let parsed = self.ensure_indexed(file.path(), &relative)?;
309                if !parsed.is_empty() {
310                    let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
311                    symbols.push(SymbolInfo {
312                        name: relative.clone(),
313                        kind: SymbolKind::File,
314                        file_path: relative.clone(),
315                        line: 0,
316                        column: 0,
317                        signature: format!(
318                            "{} ({} symbols)",
319                            file.file_name().to_string_lossy(),
320                            parsed.len()
321                        ),
322                        name_path: relative,
323                        id,
324                        body: None,
325                        children: parsed
326                            .into_iter()
327                            .map(|symbol| to_symbol_info(symbol, depth))
328                            .collect(),
329                        start_byte: 0,
330                        end_byte: 0,
331                    });
332                }
333            }
334            return Ok(symbols);
335        }
336
337        let relative = self.project.to_relative(&resolved);
338        let parsed = self.ensure_indexed(&resolved, &relative)?;
339        Ok(parsed
340            .into_iter()
341            .map(|symbol| to_symbol_info(symbol, depth))
342            .collect())
343    }
344
345    pub fn find_symbol(
346        &self,
347        name: &str,
348        file_path: Option<&str>,
349        include_body: bool,
350        exact_match: bool,
351        max_matches: usize,
352    ) -> Result<Vec<SymbolInfo>> {
353        // Fast path: if name looks like a stable symbol ID, parse and do targeted lookup
354        if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
355            let resolved = self.project.resolve(id_file)?;
356            let relative = self.project.to_relative(&resolved);
357            self.ensure_indexed(&resolved, &relative)?;
358            // Extract the leaf name from name_path (after last '/')
359            let leaf_name = id_name_path.rsplit('/').next().unwrap_or(id_name_path);
360            let db = self.writer();
361            let db_rows = db.find_symbols_by_name(leaf_name, Some(id_file), true, max_matches)?;
362            let mut results = Vec::new();
363            for row in db_rows {
364                if row.name_path != id_name_path {
365                    continue;
366                }
367                let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
368                let body = if include_body {
369                    let abs = self.project.as_path().join(&rel_path);
370                    fs::read_to_string(&abs).ok().map(|source| {
371                        slice_source(&source, row.start_byte as u32, row.end_byte as u32)
372                    })
373                } else {
374                    None
375                };
376                let kind = SymbolKind::from_str_label(&row.kind);
377                let id = make_symbol_id(&rel_path, &kind, &row.name_path);
378                results.push(SymbolInfo {
379                    name: row.name,
380                    kind,
381                    file_path: rel_path,
382                    line: row.line as usize,
383                    column: row.column_num as usize,
384                    signature: row.signature,
385                    name_path: row.name_path,
386                    id,
387                    body,
388                    children: Vec::new(),
389                    start_byte: row.start_byte as u32,
390                    end_byte: row.end_byte as u32,
391                });
392            }
393            return Ok(results);
394        }
395
396        // Ensure target files are indexed first
397        if let Some(fp) = file_path {
398            let resolved = self.project.resolve(fp)?;
399            let relative = self.project.to_relative(&resolved);
400            self.ensure_indexed(&resolved, &relative)?;
401        } else {
402            // Ensure all files are indexed for a global search
403            let files = collect_candidate_files(self.project.as_path())?;
404            for file in &files {
405                let relative = self.project.to_relative(file);
406                self.ensure_indexed(file, &relative)?;
407            }
408        }
409
410        let db = self.writer();
411        let db_rows = db.find_symbols_by_name(name, file_path, exact_match, max_matches)?;
412
413        let mut results = Vec::new();
414        for row in db_rows {
415            let rel_path = db.get_file_path(row.file_id)?.unwrap_or_default();
416            let body = if include_body {
417                let abs = self.project.as_path().join(&rel_path);
418                fs::read_to_string(&abs)
419                    .ok()
420                    .map(|source| slice_source(&source, row.start_byte as u32, row.end_byte as u32))
421            } else {
422                None
423            };
424            let kind = SymbolKind::from_str_label(&row.kind);
425            let id = make_symbol_id(&rel_path, &kind, &row.name_path);
426            results.push(SymbolInfo {
427                name: row.name,
428                kind,
429                file_path: rel_path,
430                line: row.line as usize,
431                column: row.column_num as usize,
432                signature: row.signature,
433                name_path: row.name_path,
434                id,
435                body,
436                children: Vec::new(),
437                start_byte: row.start_byte as u32,
438                end_byte: row.end_byte as u32,
439            });
440        }
441        Ok(results)
442    }
443
444    pub fn get_ranked_context(
445        &self,
446        query: &str,
447        path: Option<&str>,
448        max_tokens: usize,
449        include_body: bool,
450        depth: usize,
451    ) -> Result<RankedContextResult> {
452        let all_symbols = if let Some(path) = path {
453            self.get_symbols_overview(path, depth)?
454        } else {
455            // SelectSolve: file pre-filtering → top files → symbol extraction
456            self.select_solve_symbols(query, depth)?
457        };
458
459        let mut scored = all_symbols
460            .into_iter()
461            .flat_map(flatten_symbol_infos)
462            .filter_map(|symbol| score_symbol(query, &symbol).map(|score| (symbol, score)))
463            .collect::<Vec<_>>();
464        scored.sort_by(|left, right| right.1.cmp(&left.1));
465
466        let (selected, chars_used) =
467            prune_to_budget(scored, max_tokens, include_body, self.project.as_path());
468
469        Ok(RankedContextResult {
470            query: query.to_owned(),
471            count: selected.len(),
472            symbols: selected,
473            token_budget: max_tokens,
474            chars_used,
475        })
476    }
477
478    /// Access the underlying database (e.g. for import graph queries).
479    pub fn db(&self) -> std::sync::MutexGuard<'_, IndexDb> {
480        self.writer()
481    }
482}
483
484pub fn get_symbols_overview(
485    project: &ProjectRoot,
486    path: &str,
487    depth: usize,
488) -> Result<Vec<SymbolInfo>> {
489    let resolved = project.resolve(path)?;
490    if resolved.is_dir() {
491        return get_directory_symbols(project, &resolved, depth);
492    }
493    get_file_symbols(project, &resolved, depth)
494}
495
496/// Find the byte range (start_byte, end_byte) of a named symbol in a file.
497/// If name_path is provided (e.g. "ClassName/method"), matches by full name_path;
498/// otherwise matches by symbol name alone.
499pub fn find_symbol_range(
500    project: &ProjectRoot,
501    relative_path: &str,
502    symbol_name: &str,
503    name_path: Option<&str>,
504) -> Result<(usize, usize)> {
505    let file = project.resolve(relative_path)?;
506    let rel = project.to_relative(&file);
507    let Some(language_config) = language_for_path(&file) else {
508        bail!("unsupported file type: {}", file.display());
509    };
510    let source =
511        fs::read_to_string(&file).with_context(|| format!("failed to read {}", file.display()))?;
512    let parsed = parse_symbols(&language_config, &rel, &source, false)?;
513    let flat = flatten_symbols(parsed);
514
515    let candidate = if let Some(np) = name_path {
516        flat.into_iter()
517            .find(|sym| sym.name_path == np || sym.name == symbol_name)
518    } else {
519        flat.into_iter().find(|sym| sym.name == symbol_name)
520    };
521
522    match candidate {
523        Some(sym) => Ok((sym.start_byte as usize, sym.end_byte as usize)),
524        None => bail!(
525            "symbol '{}' not found in {}",
526            name_path.unwrap_or(symbol_name),
527            relative_path
528        ),
529    }
530}
531
532pub fn find_symbol(
533    project: &ProjectRoot,
534    name: &str,
535    file_path: Option<&str>,
536    include_body: bool,
537    exact_match: bool,
538    max_matches: usize,
539) -> Result<Vec<SymbolInfo>> {
540    // Fast path: stable symbol ID
541    if let Some((id_file, _id_kind, id_name_path)) = parse_symbol_id(name) {
542        let resolved = project.resolve(id_file)?;
543        let rel = project.to_relative(&resolved);
544        let Some(language_config) = language_for_path(&resolved) else {
545            return Ok(Vec::new());
546        };
547        let source = fs::read_to_string(&resolved)?;
548        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
549        let mut results = Vec::new();
550        for symbol in flatten_symbols(parsed) {
551            if symbol.name_path == id_name_path {
552                results.push(to_symbol_info(symbol, usize::MAX));
553                if results.len() >= max_matches {
554                    return Ok(results);
555                }
556            }
557        }
558        return Ok(results);
559    }
560
561    let files = match file_path {
562        Some(path) => vec![project.resolve(path)?],
563        None => collect_candidate_files(project.as_path())?,
564    };
565
566    let query = name.to_lowercase();
567    let mut results = Vec::new();
568
569    for file in files {
570        let rel = project.to_relative(&file);
571        let Some(language_config) = language_for_path(&file) else {
572            continue;
573        };
574        let source = match fs::read_to_string(&file) {
575            Ok(source) => source,
576            Err(_) => continue,
577        };
578        let parsed = parse_symbols(&language_config, &rel, &source, include_body)?;
579        for symbol in flatten_symbols(parsed) {
580            let matched = if exact_match {
581                symbol.name == name
582            } else {
583                scoring::contains_ascii_ci(&symbol.name, &query)
584            };
585            if matched {
586                results.push(to_symbol_info(symbol, usize::MAX));
587                if results.len() >= max_matches {
588                    return Ok(results);
589                }
590            }
591        }
592    }
593
594    Ok(results)
595}
596
597fn get_directory_symbols(
598    project: &ProjectRoot,
599    dir: &Path,
600    depth: usize,
601) -> Result<Vec<SymbolInfo>> {
602    let mut symbols = Vec::new();
603    for entry in WalkDir::new(dir)
604        .into_iter()
605        .filter_entry(|entry| !is_excluded(entry.path()))
606    {
607        let entry = entry?;
608        if !entry.file_type().is_file() {
609            continue;
610        }
611        let path = entry.path();
612        if language_for_path(path).is_none() {
613            continue;
614        }
615        let file_symbols = get_file_symbols(project, path, depth)?;
616        if !file_symbols.is_empty() {
617            let relative = project.to_relative(path);
618            let id = make_symbol_id(&relative, &SymbolKind::File, &relative);
619            symbols.push(SymbolInfo {
620                name: relative.clone(),
621                kind: SymbolKind::File,
622                file_path: relative.clone(),
623                line: 0,
624                column: 0,
625                signature: format!(
626                    "{} ({} symbols)",
627                    path.file_name()
628                        .and_then(|name| name.to_str())
629                        .unwrap_or_default(),
630                    file_symbols.len()
631                ),
632                name_path: relative,
633                id,
634                body: None,
635                children: file_symbols,
636                start_byte: 0,
637                end_byte: 0,
638            });
639        }
640    }
641    Ok(symbols)
642}
643
644fn get_file_symbols(project: &ProjectRoot, file: &Path, depth: usize) -> Result<Vec<SymbolInfo>> {
645    let relative = project.to_relative(file);
646    let Some(language_config) = language_for_path(file) else {
647        return Ok(Vec::new());
648    };
649    let source =
650        fs::read_to_string(file).with_context(|| format!("failed to read {}", file.display()))?;
651    let parsed = parse_symbols(&language_config, &relative, &source, false)?;
652    Ok(parsed
653        .into_iter()
654        .map(|symbol| to_symbol_info(symbol, depth))
655        .collect())
656}
657
658fn collect_candidate_files(root: &Path) -> Result<Vec<PathBuf>> {
659    collect_files(root, |path| language_for_path(path).is_some())
660}
661
662fn file_modified_ms(path: &Path) -> Result<u128> {
663    let modified = fs::metadata(path)
664        .with_context(|| format!("failed to stat {}", path.display()))?
665        .modified()
666        .with_context(|| format!("failed to read mtime for {}", path.display()))?;
667    Ok(modified
668        .duration_since(UNIX_EPOCH)
669        .unwrap_or_default()
670        .as_millis())
671}