use std::cmp::Ordering;
use std::collections::{HashMap, HashSet, VecDeque};
use std::path::Path;
use anyhow::{Context, Result};
use rusqlite::{params, Connection, OptionalExtension};
use serde_json::json;
use crate::model::{
CloneHotspot, CloneMatch, DependencyPath, Entity, FileExtraction, PathHop, ReferenceLocation,
RelatedEdge, SelectorSuggestion, SliceResult, SymbolLocation, TopFileSummary,
};
/// SQLite-backed store for the code graph.
///
/// Every method on this type issues its statements through the single
/// connection held here.
pub struct GraphStore {
/// Connection created by `GraphStore::open`.
conn: Connection,
}
/// Running counters describing what an indexing pass did.
///
/// The store's mutating methods take a `&mut UpsertOutcome` and bump the
/// relevant counter for every file they process.
#[derive(Debug, Clone, Default)]
pub struct UpsertOutcome {
    /// Number of files that were (re-)indexed.
    pub updated: usize,
    /// Number of files that were removed from the index.
    pub removed: usize,
    /// Number of files that were skipped; not incremented anywhere in the
    /// visible part of this file.
    pub skipped: usize,
}

impl UpsertOutcome {
    /// Creates an outcome with every counter at zero.
    ///
    /// Equivalent to [`UpsertOutcome::default`].
    pub fn new() -> Self {
        Self::default()
    }
}
/// Ordering applied to reference query results.
///
/// The value is handed to `reference_sorter`, which is defined outside this
/// view; the per-variant notes below are inferred from the variant names.
/// NOTE(review): confirm against `reference_sorter`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortOrder {
/// Presumably highest score first; this is the default for reference queries.
ScoreDesc,
/// Presumably ascending by line number.
LineAsc,
/// Presumably descending by line number.
LineDesc,
}
/// Filters, paging and ordering for symbol reference queries.
#[derive(Debug, Clone)]
pub struct ReferenceQueryOptions {
/// Exact edge type to match; when `None`, only `references` and `calls`
/// edges are returned.
pub edge_type_filter: Option<String>,
/// SQLite `GLOB` pattern matched against the edge's file path
/// (backslashes in the pattern are rewritten to `/` before binding).
pub file_glob: Option<String>,
/// Restricts results to edges whose file is recorded with this language.
pub language: Option<String>,
/// Restricts results to files indexed within the last N hours.
pub max_age_hours: Option<u64>,
/// Maximum number of rows in the returned page.
pub limit: usize,
/// Number of rows to skip before the returned page.
pub offset: usize,
/// When true, rows with identical (file, line, col, edge type) are collapsed.
pub dedup: bool,
/// Ordering applied to the full result set before paging.
pub order: SortOrder,
}
impl Default for ReferenceQueryOptions {
fn default() -> Self {
Self {
edge_type_filter: None,
file_glob: None,
language: None,
max_age_hours: None,
limit: 200,
offset: 0,
dedup: true,
order: SortOrder::ScoreDesc,
}
}
}
/// Tuning knobs for building a neighborhood slice around an anchor entity.
#[derive(Debug, Clone)]
pub struct SliceQueryOptions {
/// Upper bound on returned neighbors; `0` disables both the cap and the
/// score-based sort that precedes truncation.
pub max_neighbors: usize,
/// When true, an edge already seen with the same direction, edge type and
/// target entity is not reported again.
pub dedup: bool,
/// When true, neighbors of type `symbol_name` are limited per name.
pub suppress_low_signal_repeats: bool,
/// How many `symbol_name` neighbors with the same name are kept; values
/// below 1 are treated as 1.
pub low_signal_name_cap: usize,
/// Forwarded to `score_related_edge` (defined outside this view).
pub prefer_project_symbols: bool,
}
impl Default for SliceQueryOptions {
fn default() -> Self {
Self {
max_neighbors: 40,
dedup: true,
suppress_low_signal_repeats: true,
low_signal_name_cap: 1,
prefer_project_symbols: true,
}
}
}
/// Threshold and paging for clone (shared-fingerprint) queries.
#[derive(Debug, Clone)]
pub struct CloneQueryOptions {
/// Candidates whose similarity is below this value are dropped.
pub min_similarity: f64,
/// Maximum number of rows in the returned page.
pub limit: usize,
/// Number of rows to skip before the returned page.
pub offset: usize,
}
impl Default for CloneQueryOptions {
fn default() -> Self {
Self {
min_similarity: 0.02,
limit: 50,
offset: 0,
}
}
}
/// Serializable paging metadata returned next to a page of rows.
///
/// Instances are produced by `build_pagination` (defined outside this view)
/// from the total row count, the requested offset and limit, and the number
/// of rows actually returned.
#[derive(Debug, Clone, serde::Serialize)]
pub struct PaginationInfo {
/// Number of rows available before paging.
pub total: usize,
/// Offset that was requested.
pub offset: usize,
/// Limit that was requested.
pub limit: usize,
/// Number of rows in this page.
pub returned: usize,
/// Presumably true when rows remain after this page — confirm in `build_pagination`.
pub has_more: bool,
/// Presumably the offset of the next page, if any — confirm in `build_pagination`.
pub next_offset: Option<usize>,
}
/// Diagnostics explaining how a clone query arrived at its result set.
#[derive(Debug, Clone, serde::Serialize)]
pub struct CloneAnalysis {
/// Distinct fingerprint hashes recorded for the queried file.
pub self_fingerprint_count: i64,
/// Other files sharing at least one fingerprint hash with the queried file.
pub candidate_files: usize,
/// Candidates whose similarity met the requested threshold.
pub surviving_candidates: usize,
/// Candidates dropped by the threshold (`candidate_files - surviving_candidates`).
pub filtered_by_threshold: usize,
/// Highest similarity among all candidates, before thresholding; omitted
/// from the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub max_candidate_similarity: Option<f64>,
/// 90% of `max_candidate_similarity`, or `0.0` when the queried file has
/// no fingerprints; omitted from the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub suggested_min_similarity: Option<f64>,
/// Human-readable explanation when no matches are returned; omitted from
/// the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub empty_reason: Option<String>,
}
/// Serializable snapshot of how fresh the index is.
#[derive(Debug, Clone, serde::Serialize)]
pub struct FreshnessInfo {
/// Number of rows in the `files` table.
pub file_count: i64,
/// Largest `indexed_at` value in the `files` table, if any file is indexed.
#[serde(skip_serializing_if = "Option::is_none")]
pub latest_indexed_at: Option<String>,
/// Value stored under `schema_version` in the `meta` table, if present.
#[serde(skip_serializing_if = "Option::is_none")]
pub schema_version: Option<String>,
/// Staleness threshold, in hours, that `is_stale` was evaluated against.
pub stale_after_hours: u64,
/// True when the latest `indexed_at` is older than the threshold, or when
/// no timestamp exists at all.
pub is_stale: bool,
}
/// Inputs for selector discovery (suggesting entities to query).
///
/// Note that the derived `Default` yields `limit == 0`; the suggestion code
/// treats any limit below 1 as 1.
#[derive(Debug, Clone, Default)]
pub struct SelectorSuggestOptions {
/// Free-text query matched against entity key, name and file path.
pub query: Option<String>,
/// SQLite `GLOB` pattern on the entity's file path; entities without a
/// file path always pass this filter.
pub file_glob: Option<String>,
/// Exact entity type to restrict suggestions to.
pub entity_type: Option<String>,
/// Maximum number of suggestions returned.
pub limit: usize,
/// When true, the query is matched per token (any token may match) and an
/// empty first pass is retried without the text filter.
pub fuzzy: bool,
}
/// Serializable diagnostics describing how a selector string was resolved.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SelectorResolution {
/// Which selector form was recognised (`key`, `file`, `symbol_name`,
/// `name` or `auto`).
pub parsed_as: String,
/// Number of candidate entities the selector matched.
pub matched: usize,
/// Key of the entity that was picked, if any.
pub selected_key: Option<String>,
}
/// Internal result of resolving a selector string against the entity table.
#[derive(Debug, Clone)]
struct SelectorLookup {
/// Which selector form was recognised (`key`, `file`, `symbol_name`,
/// `name` or `auto`).
parsed_as: String,
/// Every entity the selector matched.
candidates: Vec<Entity>,
/// The candidate chosen as the resolution (the first one), if any matched.
entity: Option<Entity>,
}
impl GraphStore {
/// Opens the SQLite database at `db_path` and prepares it for use.
///
/// Applies the connection pragmas, creates any missing tables and indexes,
/// and stamps `meta.schema_version` with `'1'` (overwriting a previous
/// value).
///
/// # Errors
/// Fails when the database cannot be opened (the error carries the path) or
/// when any schema statement fails.
pub fn open(db_path: &Path) -> Result<Self> {
    // Pragmas plus idempotent `IF NOT EXISTS` DDL, executed as one batch.
    const SCHEMA_SQL: &str = "
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS files (
path TEXT PRIMARY KEY,
lang TEXT NOT NULL,
content_hash TEXT NOT NULL,
size_bytes INTEGER NOT NULL,
indexed_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS entities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
entity_type TEXT NOT NULL,
key TEXT NOT NULL UNIQUE,
name TEXT NOT NULL,
lang TEXT,
file_path TEXT,
line INTEGER,
col INTEGER,
end_line INTEGER,
end_col INTEGER,
meta_json TEXT
);
CREATE TABLE IF NOT EXISTS edges (
id INTEGER PRIMARY KEY AUTOINCREMENT,
src_entity_id INTEGER NOT NULL,
dst_entity_id INTEGER NOT NULL,
edge_type TEXT NOT NULL,
file_path TEXT,
line INTEGER,
col INTEGER,
meta_json TEXT,
FOREIGN KEY(src_entity_id) REFERENCES entities(id) ON DELETE CASCADE,
FOREIGN KEY(dst_entity_id) REFERENCES entities(id) ON DELETE CASCADE
);
CREATE TABLE IF NOT EXISTS fingerprints (
file_path TEXT NOT NULL,
fp_hash INTEGER NOT NULL,
span_start INTEGER NOT NULL,
span_end INTEGER NOT NULL
);
CREATE TABLE IF NOT EXISTS meta (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files(content_hash);
CREATE INDEX IF NOT EXISTS idx_entities_name_type ON entities(name, entity_type);
CREATE INDEX IF NOT EXISTS idx_entities_file_type ON entities(file_path, entity_type);
CREATE INDEX IF NOT EXISTS idx_edges_src_type ON edges(src_entity_id, edge_type);
CREATE INDEX IF NOT EXISTS idx_edges_dst_type ON edges(dst_entity_id, edge_type);
CREATE INDEX IF NOT EXISTS idx_edges_file ON edges(file_path);
CREATE INDEX IF NOT EXISTS idx_fingerprints_hash ON fingerprints(fp_hash, file_path);
CREATE INDEX IF NOT EXISTS idx_fingerprints_file ON fingerprints(file_path);
";
    // Insert-or-overwrite of the schema version marker.
    const SCHEMA_VERSION_UPSERT: &str = "INSERT INTO meta(key, value) VALUES('schema_version', '1')
ON CONFLICT(key) DO UPDATE SET value=excluded.value";

    let conn = Connection::open(db_path).with_context(|| {
        let shown = db_path.display();
        format!("failed to open sqlite db at {shown}")
    })?;
    conn.execute_batch(SCHEMA_SQL)?;
    conn.execute(SCHEMA_VERSION_UPSERT, [])?;
    Ok(Self { conn })
}
/// Returns the content hash recorded for `path`, or `None` when the file is
/// not tracked.
pub fn tracked_file_hash(&self, path: &str) -> Result<Option<String>> {
    const SQL: &str = "SELECT content_hash FROM files WHERE path = ?1";
    let lookup = self
        .conn
        .query_row(SQL, [path], |row| row.get::<_, String>(0));
    // `optional()` turns "no rows" into `Ok(None)`; other errors propagate.
    Ok(lookup.optional()?)
}
/// Returns the set of all file paths currently present in the `files` table.
pub fn tracked_files(&self) -> Result<HashSet<String>> {
    let mut stmt = self.conn.prepare("SELECT path FROM files")?;
    let paths = stmt
        .query_map([], |row| row.get::<_, String>(0))?
        .collect::<std::result::Result<HashSet<_>, _>>()?;
    Ok(paths)
}
pub fn remove_files(
&mut self,
removed_paths: &[String],
outcome: &mut UpsertOutcome,
) -> Result<()> {
let tx = self.conn.transaction()?;
for file_path in removed_paths {
tx.execute("DELETE FROM fingerprints WHERE file_path = ?1", [file_path])?;
tx.execute("DELETE FROM edges WHERE file_path = ?1", [file_path])?;
tx.execute(
"DELETE FROM entities WHERE file_path = ?1 OR key = ?2",
params![file_path, file_key(file_path)],
)?;
tx.execute("DELETE FROM files WHERE path = ?1", [file_path])?;
outcome.removed += 1;
}
tx.commit()?;
self.cleanup_orphan_nodes()?;
Ok(())
}
pub fn index_file(
&mut self,
file_path: &str,
language: &str,
content_hash: &str,
size_bytes: u64,
extraction: &FileExtraction,
fingerprints: &[(i64, i64, i64)],
resolved_imports: &[(String, String)],
outcome: &mut UpsertOutcome,
) -> Result<()> {
let tx = self.conn.transaction()?;
tx.execute("DELETE FROM fingerprints WHERE file_path = ?1", [file_path])?;
tx.execute("DELETE FROM edges WHERE file_path = ?1", [file_path])?;
tx.execute(
"DELETE FROM entities WHERE file_path = ?1 AND entity_type != 'file'",
[file_path],
)?;
tx.execute(
"INSERT INTO files(path, lang, content_hash, size_bytes, indexed_at)
VALUES(?1, ?2, ?3, ?4, datetime('now'))
ON CONFLICT(path) DO UPDATE SET
lang=excluded.lang,
content_hash=excluded.content_hash,
size_bytes=excluded.size_bytes,
indexed_at=excluded.indexed_at",
params![file_path, language, content_hash, size_bytes as i64],
)?;
let file_entity_id = ensure_entity_with_tx(
&tx,
"file",
&file_key(file_path),
file_path,
Some(language),
Some(file_path),
None,
None,
None,
None,
Some(json!({"kind": "source"}).to_string()),
)?;
let mut symbol_name_entities: HashMap<String, i64> = HashMap::new();
for definition in &extraction.definitions {
let symbol_key = format!(
"symbol:{}:{}:{}:{}:{}",
file_path, definition.qualname, definition.kind, definition.line, definition.col
);
let symbol_meta = json!({
"qualname": definition.qualname,
"kind": definition.kind,
"is_definition": true,
})
.to_string();
let symbol_entity_id = ensure_entity_with_tx(
&tx,
"symbol",
&symbol_key,
&definition.name,
Some(language),
Some(file_path),
Some(definition.line),
Some(definition.col),
Some(definition.end_line),
Some(definition.end_col),
Some(symbol_meta),
)?;
insert_edge_with_tx(
&tx,
file_entity_id,
symbol_entity_id,
"defines",
Some(file_path),
Some(definition.line),
Some(definition.col),
None,
)?;
let name_entity_id = if let Some(existing) = symbol_name_entities.get(&definition.name)
{
*existing
} else {
let key = symbol_name_key(language, &definition.name);
let entity_id = ensure_entity_with_tx(
&tx,
"symbol_name",
&key,
&definition.name,
Some(language),
None,
None,
None,
None,
None,
None,
)?;
symbol_name_entities.insert(definition.name.clone(), entity_id);
entity_id
};
insert_edge_with_tx(
&tx,
symbol_entity_id,
name_entity_id,
"names",
Some(file_path),
Some(definition.line),
Some(definition.col),
None,
)?;
}
for reference in &extraction.references {
let name_entity_id = if let Some(existing) = symbol_name_entities.get(&reference.name) {
*existing
} else {
let key = symbol_name_key(language, &reference.name);
let entity_id = ensure_entity_with_tx(
&tx,
"symbol_name",
&key,
&reference.name,
Some(language),
None,
None,
None,
None,
None,
None,
)?;
symbol_name_entities.insert(reference.name.clone(), entity_id);
entity_id
};
let meta = json!({
"end_line": reference.end_line,
"end_col": reference.end_col
})
.to_string();
insert_edge_with_tx(
&tx,
file_entity_id,
name_entity_id,
reference.kind.as_edge_type(),
Some(file_path),
Some(reference.line),
Some(reference.col),
Some(meta),
)?;
}
for import_item in &extraction.imports {
let module_entity_id = ensure_entity_with_tx(
&tx,
"module",
&module_key(language, &import_item.module),
&import_item.module,
Some(language),
None,
None,
None,
None,
None,
None,
)?;
insert_edge_with_tx(
&tx,
file_entity_id,
module_entity_id,
"imports",
Some(file_path),
Some(import_item.line),
Some(import_item.col),
None,
)?;
}
for (module_name, resolved_file) in resolved_imports {
let module_entity_id = ensure_entity_with_tx(
&tx,
"module",
&module_key(language, module_name),
module_name,
Some(language),
None,
None,
None,
None,
None,
None,
)?;
let resolved_file_id = ensure_entity_with_tx(
&tx,
"file",
&file_key(resolved_file),
resolved_file,
Some(language),
Some(resolved_file),
None,
None,
None,
None,
Some(json!({"kind": "source"}).to_string()),
)?;
insert_edge_with_tx(
&tx,
module_entity_id,
resolved_file_id,
"resolves_to",
Some(file_path),
None,
None,
None,
)?;
insert_edge_with_tx(
&tx,
file_entity_id,
resolved_file_id,
"depends_on",
Some(file_path),
None,
None,
Some(json!({"via": module_name}).to_string()),
)?;
}
let config_or_entry = classify_special_file(file_path);
if let Some(entity_type) = config_or_entry {
let special_id = ensure_entity_with_tx(
&tx,
entity_type,
&format!("{}:{}", entity_type, file_path),
file_path,
Some(language),
Some(file_path),
None,
None,
None,
None,
None,
)?;
insert_edge_with_tx(
&tx,
file_entity_id,
special_id,
"contains",
Some(file_path),
None,
None,
None,
)?;
}
for (fp_hash, span_start, span_end) in fingerprints {
tx.execute(
"INSERT INTO fingerprints(file_path, fp_hash, span_start, span_end)
VALUES(?1, ?2, ?3, ?4)",
params![file_path, fp_hash, span_start, span_end],
)?;
}
tx.commit()?;
self.cleanup_orphan_nodes()?;
outcome.updated += 1;
Ok(())
}
/// Lists every definition site recorded for `symbol_name`.
///
/// Definitions are found by following `names` edges from `symbol` entities
/// to the matching `symbol_name` entity, ordered by file path and line.
/// Missing file/line/column values fall back to their type defaults, a
/// missing kind to `"unknown"`, and a missing qualified name to
/// `symbol_name` itself.
pub fn symbol_definitions(&self, symbol_name: &str) -> Result<Vec<SymbolLocation>> {
    let mut stmt = self.conn.prepare(
        "
SELECT s.name, s.file_path, s.line, s.col, s.end_line, s.end_col,
json_extract(s.meta_json, '$.kind') as kind,
json_extract(s.meta_json, '$.qualname') as qualname
FROM entities sn
JOIN edges en ON en.dst_entity_id = sn.id AND en.edge_type = 'names'
JOIN entities s ON s.id = en.src_entity_id AND s.entity_type = 'symbol'
WHERE sn.entity_type = 'symbol_name' AND sn.name = ?1
ORDER BY s.file_path, s.line
",
    )?;
    let rows = stmt.query_map([symbol_name], |row| {
        // Columns are read in select order; nullable ones get fallbacks below.
        let name = row.get(0)?;
        let file_path: Option<String> = row.get(1)?;
        let line: Option<i64> = row.get(2)?;
        let col: Option<i64> = row.get(3)?;
        let end_line = row.get(4)?;
        let end_col = row.get(5)?;
        let kind: Option<String> = row.get(6)?;
        let qualname: Option<String> = row.get(7)?;
        Ok(SymbolLocation {
            symbol_name: name,
            file_path: file_path.unwrap_or_default(),
            line: line.unwrap_or_default(),
            col: col.unwrap_or_default(),
            end_line,
            end_col,
            kind: kind.unwrap_or_else(|| "unknown".to_string()),
            qualname: qualname.unwrap_or_else(|| symbol_name.to_string()),
        })
    })?;
    let mut found = Vec::new();
    for row in rows {
        found.push(row?);
    }
    Ok(found)
}
/// Returns one page of references to `symbol_name` plus paging metadata.
///
/// The full, filtered and sorted result set is computed first; the page is
/// then cut out using `options.offset` and `options.limit`, both clamped to
/// the number of available rows.
pub fn symbol_references_page(
    &self,
    symbol_name: &str,
    options: &ReferenceQueryOptions,
) -> Result<(Vec<ReferenceLocation>, PaginationInfo)> {
    let all = self.symbol_references_unpaged(symbol_name, options)?;
    let total = all.len();
    let first = options.offset.min(total);
    let last = first.saturating_add(options.limit).min(total);
    let page: Vec<_> = all.into_iter().skip(first).take(last - first).collect();
    let pagination = build_pagination(total, options.offset, options.limit, page.len());
    Ok((page, pagination))
}
/// Collects, scores and sorts every reference to `symbol_name` that passes
/// the filters in `options` (paging is applied by the caller).
///
/// Scoring: `calls` edges start at 2.0 and everything else at 1.0; a
/// reference located in a file that also defines the symbol gets +0.35. The
/// contributing reasons are recorded, comma-separated, in `why`.
fn symbol_references_unpaged(
    &self,
    symbol_name: &str,
    options: &ReferenceQueryOptions,
) -> Result<Vec<ReferenceLocation>> {
    // Build the WHERE clause and its positional bindings side by side.
    let mut bindings: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
    let mut conditions: Vec<String> = Vec::new();
    conditions.push("sn.entity_type = 'symbol_name'".to_string());
    conditions.push("sn.name = ?".to_string());
    bindings.push(Box::new(symbol_name.to_string()));
    if let Some(edge_type) = options.edge_type_filter.as_deref() {
        conditions.push("e.edge_type = ?".to_string());
        bindings.push(Box::new(edge_type.to_string()));
    } else {
        conditions.push("e.edge_type IN ('references', 'calls')".to_string());
    }
    if let Some(glob) = options.file_glob.as_deref() {
        conditions.push("e.file_path GLOB ?".to_string());
        bindings.push(Box::new(glob.replace('\\', "/")));
    }
    if let Some(language) = options.language.as_deref() {
        conditions.push("f.lang = ?".to_string());
        bindings.push(Box::new(language.to_string()));
    }
    if let Some(max_age_hours) = options.max_age_hours {
        conditions.push("f.indexed_at >= datetime('now', ?)".to_string());
        bindings.push(Box::new(format!("-{max_age_hours} hours")));
    }
    let where_sql = conditions.join(" AND ");
    let sql = format!(
        "
SELECT sn.name, e.file_path, e.line, e.col, e.edge_type
FROM entities sn
JOIN edges e ON e.dst_entity_id = sn.id
LEFT JOIN files f ON f.path = e.file_path
WHERE {where_sql}
ORDER BY e.file_path ASC, e.line ASC, e.col ASC
"
    );
    let mut stmt = self.conn.prepare(&sql)?;
    let bound = rusqlite::params_from_iter(bindings.iter().map(|value| value.as_ref()));
    let rows = stmt.query_map(bound, |row| {
        let symbol = row.get(0)?;
        let file_path: Option<String> = row.get(1)?;
        let line: Option<i64> = row.get(2)?;
        let col: Option<i64> = row.get(3)?;
        Ok(ReferenceLocation {
            symbol_name: symbol,
            file_path: file_path.unwrap_or_default(),
            line: line.unwrap_or_default(),
            col: col.unwrap_or_default(),
            edge_type: row.get(4)?,
            score: None,
            why: None,
        })
    })?;
    let mut hits = rows.collect::<std::result::Result<Vec<_>, _>>()?;

    if options.dedup {
        // Keep only the first occurrence of each (file, line, col, edge type).
        let mut already = HashSet::new();
        hits.retain(|hit| {
            let identity = (
                hit.file_path.clone(),
                hit.line,
                hit.col,
                hit.edge_type.clone(),
            );
            already.insert(identity)
        });
    }

    let def_files = self.definition_files_for_symbol(symbol_name)?;
    for hit in &mut hits {
        let base = if hit.edge_type == "calls" { 2.0 } else { 1.0 };
        let beside_definition = def_files.contains(&hit.file_path);
        let mut reasons = format!("edge_type={}", hit.edge_type);
        if beside_definition {
            reasons.push_str(",same_file_as_definition");
        }
        hit.score = Some(if beside_definition { base + 0.35 } else { base });
        hit.why = Some(reasons);
    }
    hits.sort_by(reference_sorter(options.order));
    Ok(hits)
}
pub fn dependency_path(
&self,
from_selector: &str,
to_selector: &str,
max_depth: usize,
) -> Result<DependencyPath> {
let from_resolution = self.resolve_selector(from_selector)?;
let to_resolution = self.resolve_selector(to_selector)?;
let Some(from) = from_resolution.entity else {
return Ok(DependencyPath {
found: false,
hops: Vec::new(),
});
};
let Some(to) = to_resolution.entity else {
return Ok(DependencyPath {
found: false,
hops: Vec::new(),
});
};
if from.id == to.id {
return Ok(DependencyPath {
found: true,
hops: vec![PathHop {
entity_key: from.key,
entity_name: from.name,
entity_type: from.entity_type,
}],
});
}
let mut queue: VecDeque<(i64, usize)> = VecDeque::new();
let mut seen: HashSet<i64> = HashSet::new();
let mut prev: HashMap<i64, i64> = HashMap::new();
queue.push_back((from.id, 0));
seen.insert(from.id);
while let Some((current, depth)) = queue.pop_front() {
if depth >= max_depth {
continue;
}
for neighbor in self.outgoing_neighbors(current)? {
if seen.insert(neighbor) {
prev.insert(neighbor, current);
if neighbor == to.id {
let mut chain = vec![to.id];
let mut cursor = to.id;
while let Some(parent) = prev.get(&cursor) {
chain.push(*parent);
if *parent == from.id {
break;
}
cursor = *parent;
}
chain.reverse();
let mut hops = Vec::with_capacity(chain.len());
for entity_id in chain {
let entity = self.entity_by_id(entity_id)?;
hops.push(PathHop {
entity_key: entity.key,
entity_name: entity.name,
entity_type: entity.entity_type,
});
}
return Ok(DependencyPath { found: true, hops });
}
queue.push_back((neighbor, depth + 1));
}
}
}
Ok(DependencyPath {
found: false,
hops: Vec::new(),
})
}
pub fn dependency_path_with_diagnostics(
&self,
from_selector: &str,
to_selector: &str,
max_depth: usize,
) -> Result<(DependencyPath, SelectorResolution, SelectorResolution)> {
let from_resolution = self.resolve_selector(from_selector)?;
let to_resolution = self.resolve_selector(to_selector)?;
let from_diag = SelectorResolution {
parsed_as: from_resolution.parsed_as.clone(),
matched: from_resolution.candidates.len(),
selected_key: from_resolution.entity.as_ref().map(|item| item.key.clone()),
};
let to_diag = SelectorResolution {
parsed_as: to_resolution.parsed_as.clone(),
matched: to_resolution.candidates.len(),
selected_key: to_resolution.entity.as_ref().map(|item| item.key.clone()),
};
let path = self.dependency_path(from_selector, to_selector, max_depth)?;
Ok((path, from_diag, to_diag))
}
/// Builds a small neighborhood ("slice") of the graph around an anchor.
///
/// The anchor is the symbol found for `line` in `file_path` when `line` is
/// given and such a symbol exists; otherwise it is the file entity itself.
/// Returns `Ok(None)` when no anchor exists.
///
/// Neighbors are gathered level by level for `depth` levels (at least one).
/// Expansion stops early once `options.max_neighbors` edges have been
/// collected (checked between levels). Each neighbor is annotated with its
/// depth, a score from `score_related_edge`, and a `why` string. When
/// `max_neighbors > 0` the list is sorted with `related_edge_sorter` and
/// truncated. Finally, if `suppress_low_signal_repeats` is set,
/// `symbol_name` neighbors are limited to `low_signal_name_cap` per name
/// (minimum 1).
///
/// # Errors
/// Any database error is propagated, including one raised while falling back
/// to the file entity after no symbol was found for `line`.
pub fn minimal_slice_with_options(
    &self,
    file_path: &str,
    line: Option<i64>,
    depth: usize,
    options: &SliceQueryOptions,
) -> Result<Option<SliceResult>> {
    let symbol_anchor = match line {
        Some(line_no) => self.anchor_symbol_for_line(file_path, line_no)?,
        None => None,
    };
    let anchor = match symbol_anchor {
        Some(symbol) => Some(symbol),
        // Propagate lookup failures instead of reporting "no anchor".
        None => self.find_entity_by_key(&file_key(file_path))?,
    };
    let Some(anchor) = anchor else {
        return Ok(None);
    };

    let mut neighbors = Vec::new();
    let mut frontier = vec![(anchor.id, 0usize)];
    let mut seen: HashSet<i64> = HashSet::new();
    seen.insert(anchor.id);
    let mut seen_edges: HashSet<(String, String, i64, String)> = HashSet::new();
    for _ in 0..depth.max(1) {
        let mut next = Vec::new();
        for (node_id, level) in frontier {
            for mut related in self.neighbor_edges(node_id)? {
                // A newly discovered entity is expanded at the next level,
                // even if this particular edge is later dropped as a duplicate.
                if seen.insert(related.entity.id) {
                    next.push((related.entity.id, level + 1));
                }
                if options.dedup
                    && !seen_edges.insert((
                        related.direction.clone(),
                        related.edge_type.clone(),
                        related.entity.id,
                        related.entity.key.clone(),
                    ))
                {
                    continue;
                }
                let score =
                    score_related_edge(&related, level + 1, options.prefer_project_symbols);
                related.depth = Some((level + 1) as i64);
                related.score = Some(score);
                related.why = Some(format!(
                    "edge_type={},direction={},depth={}",
                    related.edge_type,
                    related.direction,
                    level + 1
                ));
                neighbors.push(related);
            }
        }
        if next.is_empty() {
            break;
        }
        frontier = next;
        if options.max_neighbors > 0 && neighbors.len() >= options.max_neighbors {
            break;
        }
    }

    if options.max_neighbors > 0 {
        neighbors.sort_by(related_edge_sorter);
        neighbors.truncate(options.max_neighbors);
    }

    if options.suppress_low_signal_repeats {
        let cap = options.low_signal_name_cap.max(1);
        let mut seen_symbol_names: HashMap<String, usize> = HashMap::new();
        neighbors.retain(|edge| {
            if edge.entity.entity_type != "symbol_name" {
                return true;
            }
            let count = seen_symbol_names
                .entry(edge.entity.name.clone())
                .or_insert(0);
            if *count >= cap {
                return false;
            }
            *count += 1;
            true
        });
    }
    Ok(Some(SliceResult { anchor, neighbors }))
}
/// Returns just the page of clone matches for `file_path`, discarding the
/// paging metadata and analysis produced by `clone_matches_page`.
pub fn clone_matches_with_options(
    &self,
    file_path: &str,
    options: &CloneQueryOptions,
) -> Result<Vec<CloneMatch>> {
    self.clone_matches_page(file_path, options)
        .map(|(matches, _pagination, _analysis)| matches)
}
/// Finds files that share fingerprints with `file_path`.
///
/// Similarity of a candidate is the number of distinct shared fingerprint
/// hashes divided by the larger of the two files' distinct hash counts.
/// Candidates below `options.min_similarity` are dropped; the rest are sorted
/// by similarity (descending), shared count (descending) and path, then
/// paged. The returned [`CloneAnalysis`] explains how many candidates were
/// seen and filtered, and why the result is empty when it is.
pub fn clone_matches_page(
    &self,
    file_path: &str,
    options: &CloneQueryOptions,
) -> Result<(Vec<CloneMatch>, PaginationInfo, CloneAnalysis)> {
    let self_count: i64 = self.conn.query_row(
        "SELECT COUNT(DISTINCT fp_hash) FROM fingerprints WHERE file_path = ?1",
        [file_path],
        |row| row.get(0),
    )?;
    if self_count == 0 {
        // Nothing to compare against: report why instead of an empty page.
        let reason = "source file has no fingerprints; file may be too small or not yet indexed";
        let analysis = CloneAnalysis {
            self_fingerprint_count: 0,
            candidate_files: 0,
            surviving_candidates: 0,
            filtered_by_threshold: 0,
            max_candidate_similarity: None,
            suggested_min_similarity: Some(0.0),
            empty_reason: Some(reason.to_string()),
        };
        let pagination = build_pagination(0, options.offset, options.limit, 0);
        return Ok((Vec::new(), pagination, analysis));
    }

    // Distinct fingerprint count of every file, used as the denominator.
    let mut counts_stmt = self.conn.prepare(
        "SELECT file_path, COUNT(DISTINCT fp_hash) FROM fingerprints GROUP BY file_path",
    )?;
    let totals = counts_stmt
        .query_map([], |row| {
            Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
        })?
        .collect::<std::result::Result<HashMap<String, i64>, _>>()?;

    let mut shared_stmt = self.conn.prepare(
        "
SELECT f2.file_path, COUNT(DISTINCT f1.fp_hash) AS shared_count
FROM fingerprints f1
JOIN fingerprints f2 ON f1.fp_hash = f2.fp_hash
WHERE f1.file_path = ?1
AND f2.file_path != ?1
GROUP BY f2.file_path
ORDER BY shared_count DESC
",
    )?;
    let shared_rows = shared_stmt.query_map([file_path], |row| {
        Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
    })?;
    let mut candidates = Vec::new();
    for row in shared_rows {
        let (other_file, shared_fingerprints) = row?;
        let other_total = match totals.get(&other_file) {
            Some(&count) => count,
            None => 1,
        };
        let similarity = shared_fingerprints as f64 / self_count.max(other_total) as f64;
        candidates.push(CloneMatch {
            other_file,
            shared_fingerprints,
            similarity,
        });
    }

    // Statistics over all candidates are taken before thresholding.
    let candidate_files = candidates.len();
    let max_candidate_similarity = candidates
        .iter()
        .map(|item| item.similarity)
        .max_by(|left, right| left.partial_cmp(right).unwrap_or(Ordering::Equal));

    let mut surviving = candidates;
    surviving.retain(|item| item.similarity >= options.min_similarity);
    let surviving_count = surviving.len();
    let filtered_by_threshold = candidate_files.saturating_sub(surviving_count);
    surviving.sort_by(|left, right| {
        right
            .similarity
            .partial_cmp(&left.similarity)
            .unwrap_or(Ordering::Equal)
            .then_with(|| right.shared_fingerprints.cmp(&left.shared_fingerprints))
            .then_with(|| left.other_file.cmp(&right.other_file))
    });

    let total = surviving_count;
    let first = options.offset.min(total);
    let last = first.saturating_add(options.limit).min(total);
    let rows: Vec<CloneMatch> = surviving
        .into_iter()
        .skip(first)
        .take(last - first)
        .collect();
    let pagination = build_pagination(total, options.offset, options.limit, rows.len());

    let empty_reason = match (total, candidate_files) {
        (0, 0) => Some("no overlapping fingerprints with other files".to_string()),
        (0, _) => Some(format!(
            "all clone candidates were filtered by min_similarity={:.3}; try lowering the threshold",
            options.min_similarity
        )),
        _ => None,
    };
    let analysis = CloneAnalysis {
        self_fingerprint_count: self_count,
        candidate_files,
        surviving_candidates: surviving_count,
        filtered_by_threshold,
        max_candidate_similarity,
        suggested_min_similarity: max_candidate_similarity.map(|value| (value * 0.9).max(0.0)),
        empty_reason,
    };
    Ok((rows, pagination, analysis))
}
/// Returns just the page of clone hotspots for `file_path`, discarding the
/// paging metadata and analysis produced by `clone_hotspots_page`.
pub fn clone_hotspots(
    &self,
    file_path: &str,
    options: &CloneQueryOptions,
) -> Result<Vec<CloneHotspot>> {
    self.clone_hotspots_page(file_path, options)
        .map(|(hotspots, _pagination, _analysis)| hotspots)
}
/// Groups clone matches for `file_path` by the directory of the matching
/// file and returns one page of those directory "hotspots".
///
/// All matches at or above `options.min_similarity` are considered
/// (`options.limit` / `options.offset` page the hotspots, not the underlying
/// matches). Each hotspot carries the number of matching files and their
/// average and maximum similarity. Hotspots are sorted by average similarity
/// (descending), file count (descending) and directory name.
///
/// Files without a directory component are grouped under `"."`.
pub fn clone_hotspots_page(
    &self,
    file_path: &str,
    options: &CloneQueryOptions,
) -> Result<(Vec<CloneHotspot>, PaginationInfo, CloneAnalysis)> {
    let (matches, _, analysis) = self.clone_matches_page(
        file_path,
        &CloneQueryOptions {
            min_similarity: options.min_similarity,
            limit: usize::MAX,
            offset: 0,
        },
    )?;

    // directory -> (file count, similarity sum, similarity max)
    let mut buckets: HashMap<String, (i64, f64, f64)> = HashMap::new();
    for row in matches {
        let dir = Path::new(&row.other_file)
            .parent()
            // `parent()` yields `Some("")` for a bare file name; treat that
            // the same as "no parent" so root-level files land in ".".
            .filter(|parent| !parent.as_os_str().is_empty())
            .map(|parent| parent.to_string_lossy().replace('\\', "/"))
            .unwrap_or_else(|| ".".to_string());
        let entry = buckets.entry(dir).or_insert((0, 0.0, 0.0));
        entry.0 += 1;
        entry.1 += row.similarity;
        if row.similarity > entry.2 {
            entry.2 = row.similarity;
        }
    }

    let mut out: Vec<CloneHotspot> = buckets
        .into_iter()
        .map(
            |(directory, (files, sum_similarity, max_similarity))| CloneHotspot {
                directory,
                files,
                // A bucket only exists once a file was added, so `files >= 1`.
                avg_similarity: sum_similarity / files as f64,
                max_similarity,
            },
        )
        .collect();
    out.sort_by(|left, right| {
        right
            .avg_similarity
            .partial_cmp(&left.avg_similarity)
            .unwrap_or(Ordering::Equal)
            .then_with(|| right.files.cmp(&left.files))
            .then_with(|| left.directory.cmp(&right.directory))
    });

    let total = out.len();
    let start = options.offset.min(total);
    let end = start.saturating_add(options.limit).min(total);
    let rows: Vec<CloneHotspot> = out.into_iter().skip(start).take(end - start).collect();
    let pagination = build_pagination(total, options.offset, options.limit, rows.len());
    Ok((rows, pagination, analysis))
}
/// Counts how many of `rows` fall in each file and returns the files with
/// the most references first (ties broken by path, ascending).
///
/// A `limit` of 0 returns every file; otherwise at most `limit` entries.
pub fn top_reference_files(
    &self,
    rows: &[ReferenceLocation],
    limit: usize,
) -> Vec<TopFileSummary> {
    let mut tally: HashMap<&str, i64> = HashMap::new();
    for reference in rows {
        *tally.entry(reference.file_path.as_str()).or_default() += 1;
    }
    let mut summaries = Vec::with_capacity(tally.len());
    for (path, count) in tally {
        summaries.push(TopFileSummary {
            file_path: path.to_string(),
            count,
        });
    }
    summaries.sort_by(|a, b| {
        let by_count = b.count.cmp(&a.count);
        by_count.then_with(|| a.file_path.cmp(&b.file_path))
    });
    if limit > 0 {
        summaries.truncate(limit);
    }
    summaries
}
/// Suggests entities matching the query in `options`, best match first.
///
/// A first fetch pulls up to `8 * limit` rows (capped at 2000) using the text
/// filter. If that returns nothing and fuzzy mode is on with a non-empty
/// query, a wider fetch of up to `200 * limit` rows (capped at 20000) is made
/// without the text filter so that scoring alone decides. Every row is scored
/// with `discovery_score`, the list is sorted by score (descending), entity
/// rank and key, and truncated to `limit` (a limit below 1 is treated as 1).
pub fn selector_suggestions_advanced(
    &self,
    options: &SelectorSuggestOptions,
) -> Result<Vec<SelectorSuggestion>> {
    let query_tokens = tokenize_discovery_query(options.query.as_deref().unwrap_or_default());
    let query_lower = options
        .query
        .as_deref()
        .map(str::trim)
        .unwrap_or_default()
        .to_ascii_lowercase();
    let limit = options.limit.max(1);
    // Saturating: a very large caller-supplied limit must not overflow
    // before the cap is applied. After the cap the value always fits in i64.
    let base_fetch = limit.saturating_mul(8).min(2000) as i64;
    let mut out = self.selector_suggestions_fetch(
        options,
        &query_lower,
        &query_tokens,
        options.fuzzy,
        base_fetch,
    )?;
    if out.is_empty() && options.fuzzy && !query_lower.is_empty() {
        let widened_fetch = limit.saturating_mul(200).min(20000) as i64;
        out = self.selector_suggestions_fetch(
            options,
            &query_lower,
            &query_tokens,
            false,
            widened_fetch,
        )?;
    }
    for suggestion in &mut out {
        let (score, why) =
            discovery_score(suggestion, &query_lower, &query_tokens, options.fuzzy);
        suggestion.score = Some(score);
        suggestion.why = Some(why);
    }
    out.sort_by(|left, right| {
        right
            .score
            .partial_cmp(&left.score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| entity_rank(&left.entity_type).cmp(&entity_rank(&right.entity_type)))
            .then_with(|| left.key.cmp(&right.key))
    });
    out.truncate(limit);
    Ok(out)
}
/// Fetches up to `fetch_limit` candidate entities for selector suggestion.
///
/// When `include_query_filter` is set and the query is non-empty, rows must
/// contain the query text in their key, name or file path: in fuzzy mode any
/// single token may match, otherwise the whole lower-cased query must.
/// `options.entity_type` and `options.file_glob` narrow the set further
/// (entities without a file path always pass the glob). Rows are ordered by
/// entity type (file, symbol_name, symbol, module, others) and then key.
///
/// The query text is matched literally: `%`, `_` and `\` in it are escaped so
/// they are not interpreted as `LIKE` wildcards.
fn selector_suggestions_fetch(
    &self,
    options: &SelectorSuggestOptions,
    query_lower: &str,
    query_tokens: &[String],
    include_query_filter: bool,
    fetch_limit: i64,
) -> Result<Vec<SelectorSuggestion>> {
    /// Builds a `%term%` pattern with LIKE metacharacters backslash-escaped.
    fn like_contains(term: &str) -> String {
        let mut pattern = String::with_capacity(term.len() + 2);
        pattern.push('%');
        for ch in term.chars() {
            if matches!(ch, '\\' | '%' | '_') {
                pattern.push('\\');
            }
            pattern.push(ch);
        }
        pattern.push('%');
        pattern
    }
    // One "contains" test over the three searchable columns (three binds).
    const LIKE_CLAUSE: &str = "(key LIKE ? ESCAPE '\\' OR name LIKE ? ESCAPE '\\' OR COALESCE(file_path, '') LIKE ? ESCAPE '\\')";

    let mut where_clauses = Vec::new();
    let mut params: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
    if include_query_filter && !query_lower.is_empty() {
        if options.fuzzy && !query_tokens.is_empty() {
            let mut token_parts = Vec::with_capacity(query_tokens.len());
            for token in query_tokens {
                token_parts.push(LIKE_CLAUSE.to_string());
                let wildcard = like_contains(&token.to_ascii_lowercase());
                params.push(Box::new(wildcard.clone()));
                params.push(Box::new(wildcard.clone()));
                params.push(Box::new(wildcard));
            }
            where_clauses.push(format!("({})", token_parts.join(" OR ")));
        } else {
            where_clauses.push(LIKE_CLAUSE.to_string());
            let wildcard = like_contains(query_lower);
            params.push(Box::new(wildcard.clone()));
            params.push(Box::new(wildcard.clone()));
            params.push(Box::new(wildcard));
        }
    }
    if let Some(entity_type) = options.entity_type.as_deref() {
        where_clauses.push("entity_type = ?".to_string());
        params.push(Box::new(entity_type.to_string()));
    }
    if let Some(file_glob) = options.file_glob.as_deref() {
        where_clauses.push("(file_path IS NULL OR COALESCE(file_path, '') GLOB ?)".to_string());
        params.push(Box::new(file_glob.replace('\\', "/")));
    }
    let where_sql = if where_clauses.is_empty() {
        "1=1".to_string()
    } else {
        where_clauses.join(" AND ")
    };
    let sql = format!(
        "
SELECT entity_type, key, name, file_path, line
FROM entities
WHERE {where_sql}
ORDER BY
CASE entity_type
WHEN 'file' THEN 0
WHEN 'symbol_name' THEN 1
WHEN 'symbol' THEN 2
WHEN 'module' THEN 3
ELSE 9
END,
key
LIMIT ?
"
    );
    // The LIMIT placeholder is last in the SQL, so its value is bound last.
    params.push(Box::new(fetch_limit.max(1)));
    let bind_params = rusqlite::params_from_iter(params.iter().map(|p| &**p));
    let mut stmt = self.conn.prepare(&sql)?;
    let rows = stmt.query_map(bind_params, |row| {
        Ok(SelectorSuggestion {
            entity_type: row.get(0)?,
            key: row.get(1)?,
            name: row.get(2)?,
            file_path: row.get(3)?,
            line: row.get(4)?,
            score: None,
            why: None,
        })
    })?;
    let mut out = Vec::new();
    for row in rows {
        out.push(row?);
    }
    Ok(out)
}
/// Returns a human-readable warning when the index is empty, has no
/// timestamp, or its newest entry is older than `stale_after_hours`;
/// `None` when the index looks usable.
pub fn index_warning(&self, stale_after_hours: u64) -> Result<Option<String>> {
    let tracked: i64 = self
        .conn
        .query_row("SELECT COUNT(*) FROM files", [], |row| row.get(0))?;
    if tracked == 0 {
        let message = "index is empty; run lumora.index_repository before querying";
        return Ok(Some(message.to_string()));
    }

    let newest: Option<String> = self
        .conn
        .query_row("SELECT MAX(indexed_at) FROM files", [], |row| row.get(0))
        .optional()?
        .flatten();
    let latest = match newest {
        Some(timestamp) => timestamp,
        None => {
            let message = "index timestamp unavailable; results may be partial";
            return Ok(Some(message.to_string()));
        }
    };

    // Let SQLite do the date arithmetic: stale means latest < now - N hours.
    let stale_cutoff = format!("-{stale_after_hours} hours");
    let stale_flag: i64 = self.conn.query_row(
        "SELECT CASE WHEN ?1 < datetime('now', ?2) THEN 1 ELSE 0 END",
        params![latest, stale_cutoff],
        |row| row.get(0),
    )?;
    if stale_flag <= 0 {
        return Ok(None);
    }
    Ok(Some(format!(
        "index appears stale (latest indexed_at={latest}); consider re-indexing"
    )))
}
/// Gathers index freshness facts: file count, newest `indexed_at`, stored
/// schema version, and whether the index is stale relative to
/// `stale_after_hours`. An index with no timestamp at all counts as stale.
pub fn freshness_info(&self, stale_after_hours: u64) -> Result<FreshnessInfo> {
    let file_count: i64 = self
        .conn
        .query_row("SELECT COUNT(*) FROM files", [], |row| row.get(0))?;
    let latest_indexed_at: Option<String> = self
        .conn
        .query_row("SELECT MAX(indexed_at) FROM files", [], |row| row.get(0))
        .optional()?
        .flatten();
    let schema_version: Option<String> = self
        .conn
        .query_row(
            "SELECT value FROM meta WHERE key = 'schema_version'",
            [],
            |row| row.get(0),
        )
        .optional()?;

    let is_stale = match latest_indexed_at.as_deref() {
        None => true,
        Some(latest) => {
            // Stale means latest < now - N hours, evaluated by SQLite.
            let stale_cutoff = format!("-{stale_after_hours} hours");
            let flag: i64 = self.conn.query_row(
                "SELECT CASE WHEN ?1 < datetime('now', ?2) THEN 1 ELSE 0 END",
                params![latest, stale_cutoff],
                |row| row.get(0),
            )?;
            flag > 0
        }
    };

    Ok(FreshnessInfo {
        file_count,
        latest_indexed_at,
        schema_version,
        stale_after_hours,
        is_stale,
    })
}
/// Returns the set of files containing a definition of `symbol_name`.
///
/// Definitions whose entity has no file path are ignored.
fn definition_files_for_symbol(&self, symbol_name: &str) -> Result<HashSet<String>> {
    let mut stmt = self.conn.prepare(
        "
SELECT DISTINCT s.file_path
FROM entities sn
JOIN edges en ON en.dst_entity_id = sn.id AND en.edge_type = 'names'
JOIN entities s ON s.id = en.src_entity_id AND s.entity_type = 'symbol'
WHERE sn.entity_type = 'symbol_name' AND sn.name = ?1
",
    )?;
    let rows = stmt.query_map([symbol_name], |row| row.get::<_, Option<String>>(0))?;
    let mut files = HashSet::new();
    for row in rows {
        // `Option` iterates over zero or one item, so NULL paths add nothing.
        files.extend(row?);
    }
    Ok(files)
}
/// Resolves a selector string to candidate entities and a chosen entity.
///
/// - key / file / symbol-name selectors are exact key lookups (file paths are
///   normalized first) and yield at most one candidate;
/// - name selectors return every entity with that name, choosing the first;
/// - anything else ("auto") tries the raw text as a key, as a file path and
///   as a name, de-duplicates by id, sorts by entity rank then key, and
///   chooses the first.
fn resolve_selector(&self, selector: &str) -> Result<SelectorLookup> {
    // Shared shape of the three exact-key selector forms.
    let exact = |parsed_as: &str, key: &str| -> Result<SelectorLookup> {
        let entity = self.find_entity_by_key(key)?;
        Ok(SelectorLookup {
            parsed_as: parsed_as.to_string(),
            candidates: entity.iter().cloned().collect(),
            entity,
        })
    };

    match parse_selector(selector)? {
        ParsedSelector::Key(key) => exact("key", &key),
        ParsedSelector::File(path) => {
            let normalized = normalize_selector_path(&path);
            exact("file", &file_key(&normalized))
        }
        ParsedSelector::SymbolName { lang, name } => {
            exact("symbol_name", &symbol_name_key(&lang, &name))
        }
        ParsedSelector::Name(name) => {
            let candidates = self.entities_by_name(&name)?;
            Ok(SelectorLookup {
                parsed_as: "name".to_string(),
                entity: candidates.first().cloned(),
                candidates,
            })
        }
        ParsedSelector::Auto(raw) => {
            let normalized = normalize_selector_path(&raw);
            let mut candidates = Vec::new();
            candidates.extend(self.find_entity_by_key(&normalized)?);
            candidates.extend(self.find_entity_by_key(&file_key(&normalized))?);
            candidates.extend(self.entities_by_name(&raw)?);
            dedup_entities_by_id(&mut candidates);
            candidates.sort_by(|a, b| {
                let rank_a = entity_rank(&a.entity_type);
                let rank_b = entity_rank(&b.entity_type);
                rank_a.cmp(&rank_b).then_with(|| a.key.cmp(&b.key))
            });
            Ok(SelectorLookup {
                parsed_as: "auto".to_string(),
                entity: candidates.first().cloned(),
                candidates,
            })
        }
    }
}
/// Returns the destination entity id of every edge whose source is
/// `entity_id`. One id is returned per edge, so ids may repeat.
///
/// # Errors
/// Returns any error raised while preparing the statement or reading a row.
fn outgoing_neighbors(&self, entity_id: i64) -> Result<Vec<i64>> {
    let mut stmt = self
        .conn
        .prepare("SELECT dst_entity_id FROM edges WHERE src_entity_id = ?1")?;
    let mut neighbor_ids = Vec::new();
    for dst_id in stmt.query_map([entity_id], |row| row.get::<_, i64>(0))? {
        neighbor_ids.push(dst_id?);
    }
    Ok(neighbor_ids)
}
/// Loads the entity with primary key `id`.
///
/// # Errors
/// Fails when the query fails; because the lookup is not made optional, a
/// missing row is reported as an error rather than as `None`.
fn entity_by_id(&self, id: i64) -> Result<Entity> {
    const SELECT_BY_ID_SQL: &str = "
SELECT id, entity_type, key, name, lang, file_path, line, col, end_line, end_col, meta_json
FROM entities
WHERE id = ?1
";
    let entity = self.conn.query_row(SELECT_BY_ID_SQL, [id], map_entity)?;
    Ok(entity)
}
/// Returns up to 32 entities whose `name` equals `name` exactly.
///
/// Rows are ordered by entity type (`symbol`, then `symbol_name`, then
/// `file`, then everything else) and, within a type, by file path and line.
///
/// # Errors
/// Returns any error raised while preparing the statement or mapping a row.
fn entities_by_name(&self, name: &str) -> Result<Vec<Entity>> {
    const SELECT_BY_NAME_SQL: &str = "
SELECT id, entity_type, key, name, lang, file_path, line, col, end_line, end_col, meta_json
FROM entities
WHERE name = ?1
ORDER BY
CASE entity_type
WHEN 'symbol' THEN 0
WHEN 'symbol_name' THEN 1
WHEN 'file' THEN 2
ELSE 3
END,
file_path,
line
LIMIT 32
";
    let mut stmt = self.conn.prepare(SELECT_BY_NAME_SQL)?;
    let mut matches = Vec::new();
    for entity in stmt.query_map([name], map_entity)? {
        matches.push(entity?);
    }
    Ok(matches)
}
/// Looks up the entity whose `key` column equals `key`.
///
/// Returns `Ok(None)` when no row matches.
///
/// # Errors
/// Returns any error raised while preparing or running the query.
fn find_entity_by_key(&self, key: &str) -> Result<Option<Entity>> {
    const SELECT_BY_KEY_SQL: &str = "
SELECT id, entity_type, key, name, lang, file_path, line, col, end_line, end_col, meta_json
FROM entities
WHERE key = ?1
LIMIT 1
";
    let mut stmt = self.conn.prepare(SELECT_BY_KEY_SQL)?;
    let found = stmt.query_row([key], map_entity).optional()?;
    Ok(found)
}
/// Finds the `symbol` entity in `file_path` whose line range contains `line`
/// and spans the fewest lines (the tightest enclosing symbol).
///
/// Returns `Ok(None)` when no symbol range covers the line.
///
/// Several symbols can share the same line span (for example two items on
/// one line). The span length alone would leave their relative order
/// unspecified, so ties are broken by later start line, then later start
/// column, then lowest id, which makes the result deterministic.
/// NOTE(review): preferring the later start column assumes it identifies the
/// more deeply nested symbol on a shared line; confirm against the indexer.
///
/// # Errors
/// Returns any error raised while preparing or running the query.
fn anchor_symbol_for_line(&self, file_path: &str, line: i64) -> Result<Option<Entity>> {
    let mut stmt = self.conn.prepare(
        "
SELECT id, entity_type, key, name, lang, file_path, line, col, end_line, end_col, meta_json
FROM entities
WHERE entity_type = 'symbol'
AND file_path = ?1
AND line <= ?2
AND end_line >= ?2
ORDER BY (end_line - line) ASC, line DESC, col DESC, id ASC
LIMIT 1
",
    )?;
    stmt.query_row(params![file_path, line], map_entity)
        .optional()
        .map_err(Into::into)
}
fn neighbor_edges(&self, entity_id: i64) -> Result<Vec<RelatedEdge>> {
let mut out = Vec::new();
let mut outgoing = self.conn.prepare(
"
SELECT e.edge_type,
dst.id, dst.entity_type, dst.key, dst.name, dst.lang, dst.file_path,
dst.line, dst.col, dst.end_line, dst.end_col, dst.meta_json
FROM edges e
JOIN entities dst ON dst.id = e.dst_entity_id
WHERE e.src_entity_id = ?1
",
)?;
let outgoing_rows = outgoing.query_map([entity_id], |row| {
Ok(RelatedEdge {
edge_type: row.get(0)?,
direction: "outgoing".to_string(),
entity: Entity {
id: row.get(1)?,
entity_type: row.get(2)?,
key: row.get(3)?,
name: row.get(4)?,
lang: row.get(5)?,
file_path: row.get(6)?,
line: row.get(7)?,
col: row.get(8)?,
end_line: row.get(9)?,
end_col: row.get(10)?,
meta_json: row.get(11)?,
},
depth: None,
score: None,
why: None,
})
})?;
for row in outgoing_rows {
out.push(row?);
}
let mut incoming = self.conn.prepare(
"
SELECT e.edge_type,
src.id, src.entity_type, src.key, src.name, src.lang, src.file_path,
src.line, src.col, src.end_line, src.end_col, src.meta_json
FROM edges e
JOIN entities src ON src.id = e.src_entity_id
WHERE e.dst_entity_id = ?1
",
)?;
let incoming_rows = incoming.query_map([entity_id], |row| {
Ok(RelatedEdge {
edge_type: row.get(0)?,
direction: "incoming".to_string(),
entity: Entity {
id: row.get(1)?,
entity_type: row.get(2)?,
key: row.get(3)?,
name: row.get(4)?,
lang: row.get(5)?,
file_path: row.get(6)?,
line: row.get(7)?,
col: row.get(8)?,
end_line: row.get(9)?,
end_col: row.get(10)?,
meta_json: row.get(11)?,
},
depth: None,
score: None,
why: None,
})
})?;
for row in incoming_rows {
out.push(row?);
}
Ok(out)
}
/// Deletes `symbol_name` and `module` entities that no edge references,
/// either as source or as destination.
///
/// # Errors
/// Returns any error raised while executing the delete statement.
fn cleanup_orphan_nodes(&mut self) -> Result<()> {
    const DELETE_ORPHANS_SQL: &str = "
DELETE FROM entities
WHERE entity_type IN ('symbol_name', 'module')
AND id NOT IN (SELECT src_entity_id FROM edges)
AND id NOT IN (SELECT dst_entity_id FROM edges)
";
    // The number of deleted rows is not reported to the caller.
    self.conn
        .execute(DELETE_ORPHANS_SQL, [])
        .map(|_deleted_rows| ())
        .map_err(Into::into)
}
}
/// Syntactic classification of a selector string, produced by
/// `parse_selector` and consumed by `GraphStore::resolve_selector`.
#[derive(Debug, Clone)]
enum ParsedSelector {
/// An entity key that is looked up verbatim.
Key(String),
/// A file path (from a `file:` selector); it is slash-normalized and turned
/// into a file key before lookup.
File(String),
/// A language-qualified symbol name (from `symbol_name:<lang>:<name>`).
SymbolName { lang: String, name: String },
/// A bare entity name (from a `symbol:` selector), matched by exact name.
Name(String),
/// Unprefixed input; resolution tries it as a key, as a file path and as a
/// name.
Auto(String),
}
/// Parses a user-supplied selector string into a [`ParsedSelector`].
///
/// Surrounding whitespace is ignored. Forms are checked in this order:
///
/// * `file:<path>` → [`ParsedSelector::File`]
/// * `symbol_name:<lang>:<name>` → [`ParsedSelector::SymbolName`]
/// * `symbol:<name>` → [`ParsedSelector::Name`]
/// * `key:<key>` → [`ParsedSelector::Key`] with the prefix removed once
/// * `module:...` → [`ParsedSelector::Key`] holding the whole selector, since
///   the selector already has the shape of a module key
/// * anything else → [`ParsedSelector::Auto`]
///
/// The `key:` prefix is stripped exactly once, so a key that itself begins
/// with `key:` survives intact (`key:key:x` yields the key `key:x`).
///
/// # Errors
/// Fails when the selector is empty, or when a recognised prefix is not
/// followed by the value it requires.
fn parse_selector(selector: &str) -> Result<ParsedSelector> {
    let value = selector.trim();
    if value.is_empty() {
        anyhow::bail!(
            "selector is empty. Examples: file:src/main.rs, symbol_name:rust:run_mcp_stdio, main"
        );
    }
    if let Some(rest) = value.strip_prefix("file:") {
        let file = rest.trim();
        if file.is_empty() {
            anyhow::bail!("invalid `file:` selector: missing path. Example: file:src/main.rs");
        }
        return Ok(ParsedSelector::File(file.to_string()));
    }
    if let Some(rest) = value.strip_prefix("symbol_name:") {
        // Only the first ':' separates language from name; the name itself
        // may contain further colons (e.g. qualified paths).
        let (lang, name) = rest.split_once(':').unwrap_or((rest, ""));
        let (lang, name) = (lang.trim(), name.trim());
        if lang.is_empty() || name.is_empty() {
            anyhow::bail!(
                "invalid `symbol_name:` selector. Expected symbol_name:<lang>:<name>, e.g. symbol_name:rust:run_mcp_stdio"
            );
        }
        return Ok(ParsedSelector::SymbolName {
            lang: lang.to_string(),
            name: name.to_string(),
        });
    }
    if let Some(rest) = value.strip_prefix("symbol:") {
        let symbol = rest.trim();
        if symbol.is_empty() {
            anyhow::bail!("invalid `symbol:` selector: missing name. Example: symbol:main");
        }
        return Ok(ParsedSelector::Name(symbol.to_string()));
    }
    if let Some(rest) = value.strip_prefix("key:") {
        let raw = rest.trim();
        if raw.is_empty() {
            anyhow::bail!("invalid `key:` selector: missing key value");
        }
        return Ok(ParsedSelector::Key(raw.to_string()));
    }
    if value.starts_with("module:") {
        return Ok(ParsedSelector::Key(value.to_string()));
    }
    Ok(ParsedSelector::Auto(value.to_string()))
}
/// Trims surrounding whitespace from `path` and converts every backslash to
/// a forward slash.
fn normalize_selector_path(path: &str) -> String {
    path.trim()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
/// Removes entities whose `id` already occurred earlier in `items`, keeping
/// the first occurrence of each id and preserving the original order.
fn dedup_entities_by_id(items: &mut Vec<Entity>) {
    let mut seen_ids = HashSet::new();
    *items = std::mem::take(items)
        .into_iter()
        // `insert` returns false for an id that was already recorded.
        .filter(|entity| seen_ids.insert(entity.id))
        .collect();
}
/// Returns the sort rank of an entity type; lower ranks sort first.
///
/// `symbol` is 0, `symbol_name` 1, `file` 2, `module` 3 and any other type 9.
fn entity_rank(entity_type: &str) -> i64 {
    // The index in this list is the rank.
    const RANKED_TYPES: [&str; 4] = ["symbol", "symbol_name", "file", "module"];
    RANKED_TYPES
        .iter()
        .position(|known| *known == entity_type)
        .map_or(9, |index| index as i64)
}
/// Builds a comparator for reference locations according to `order`.
///
/// * `ScoreDesc`: higher score first, then file path, line and column
///   ascending. Scores that cannot be compared count as equal.
/// * `LineAsc`: file path, line, column ascending.
/// * `LineDesc`: the exact reverse of `LineAsc`.
fn reference_sorter(
    order: SortOrder,
) -> impl FnMut(&ReferenceLocation, &ReferenceLocation) -> Ordering + Copy {
    move |left, right| {
        let by_position = left
            .file_path
            .cmp(&right.file_path)
            .then(left.line.cmp(&right.line))
            .then(left.col.cmp(&right.col));
        match order {
            SortOrder::ScoreDesc => {
                // Operands are swapped to obtain descending score order.
                let by_score = right
                    .score
                    .partial_cmp(&left.score)
                    .unwrap_or(Ordering::Equal);
                by_score.then(by_position)
            }
            SortOrder::LineAsc => by_position,
            // Reversing the lexicographic chain equals reversing each key.
            SortOrder::LineDesc => by_position.reverse(),
        }
    }
}
/// Scores a related edge for ranking; the result is never negative.
///
/// The score is the edge-type weight, plus 0.2 for outgoing edges, minus
/// 0.25 for every level of `depth` beyond the first. For `symbol_name`
/// entities, low-signal names lose 1.3 and, when `prefer_project_symbols`
/// is set, project-local names gain 0.35.
fn score_related_edge(edge: &RelatedEdge, depth: usize, prefer_project_symbols: bool) -> f64 {
    // Edge types not listed here weigh 1.0.
    const EDGE_WEIGHTS: [(&str, f64); 7] = [
        ("calls", 2.5),
        ("depends_on", 2.2),
        ("imports", 2.0),
        ("defines", 1.8),
        ("references", 1.2),
        ("names", 0.8),
        ("contains", 0.6),
    ];
    let edge_weight = EDGE_WEIGHTS
        .iter()
        .find(|(kind, _)| edge.edge_type.as_str() == *kind)
        .map_or(1.0, |&(_, weight)| weight);
    let direction_boost = if edge.direction == "outgoing" {
        0.2
    } else {
        0.0
    };
    let depth_penalty = (depth as f64 - 1.0) * 0.25;
    let base = edge_weight + direction_boost - depth_penalty;

    let name = &edge.entity.name;
    let adjusted = if edge.entity.entity_type != "symbol_name" {
        base
    } else if is_low_signal_symbol_name(name) {
        base - 1.3
    } else if prefer_project_symbols && is_project_local_symbol_name(name) {
        base + 0.35
    } else {
        base
    };
    adjusted.max(0.0)
}
/// Orders related edges by score descending, then by edge type, direction
/// and entity key ascending. Scores that cannot be compared count as equal.
fn related_edge_sorter(left: &RelatedEdge, right: &RelatedEdge) -> Ordering {
    // Operands are swapped to obtain descending score order.
    let by_score = right
        .score
        .partial_cmp(&left.score)
        .unwrap_or(Ordering::Equal);
    if by_score != Ordering::Equal {
        return by_score;
    }
    let left_tiebreak = (&left.edge_type, &left.direction, &left.entity.key);
    let right_tiebreak = (&right.edge_type, &right.direction, &right.entity.key);
    left_tiebreak.cmp(&right_tiebreak)
}
/// Assembles pagination metadata for one page of results.
///
/// The reported offset is clamped to `total` and the reported limit is at
/// least 1. `has_more` is true when the clamped offset plus `returned` is
/// still below `total`; in that case `next_offset` is that sum.
fn build_pagination(total: usize, offset: usize, limit: usize, returned: usize) -> PaginationInfo {
    let clamped_offset = offset.min(total);
    let page_end = clamped_offset + returned;
    let has_more = page_end < total;
    PaginationInfo {
        total,
        offset: clamped_offset,
        limit: limit.max(1),
        returned,
        has_more,
        next_offset: Some(page_end).filter(|_| has_more),
    }
}
/// Splits a discovery query into lowercase tokens.
///
/// A token is a maximal run of ASCII letters, ASCII digits, `:` and `/`;
/// every other character (including non-ASCII letters) separates tokens.
fn tokenize_discovery_query(input: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current = String::new();
    for ch in input.chars() {
        if ch.is_ascii_alphanumeric() || matches!(ch, ':' | '/') {
            current.push(ch.to_ascii_lowercase());
        } else if !current.is_empty() {
            // A separator ends the token collected so far.
            tokens.push(std::mem::take(&mut current));
        }
    }
    if !current.is_empty() {
        tokens.push(current);
    }
    tokens
}
/// Scores a selector suggestion against a query and explains the score.
///
/// Returns `(score, reasons)` where `reasons` is a comma-separated list of
/// the rules that fired. Matching is done on ASCII-lowercased name, key and
/// slash-normalized file path; `query_lower` and `query_tokens` are compared
/// as given, so callers are expected to pass them already lowercased.
///
/// * Empty query: the score is `10 - entity_rank` and the only reason is
///   `no_query_default_ranking`.
/// * Otherwise whole-query rules (exact, prefix, contains), per-token
///   containment bonuses, an optional fuzzy bonus of up to 25 and a small
///   entity-type bonus are added up. If no rule fired the reason is
///   `fallback_rank`.
fn discovery_score(
    suggestion: &SelectorSuggestion,
    query_lower: &str,
    query_tokens: &[String],
    fuzzy: bool,
) -> (f64, String) {
    let type_rank = entity_rank(&suggestion.entity_type);
    if query_lower.is_empty() {
        return (
            10.0 - type_rank as f64,
            "no_query_default_ranking".to_string(),
        );
    }

    let key = suggestion.key.to_ascii_lowercase();
    let name = suggestion.name.to_ascii_lowercase();
    let path = suggestion
        .file_path
        .as_deref()
        .unwrap_or_default()
        .replace('\\', "/")
        .to_ascii_lowercase();

    let mut score = 0.0_f64;
    let mut reasons: Vec<String> = Vec::new();

    // Whole-query rules; every rule that holds contributes, they are not
    // mutually exclusive.
    let whole_query_rules = [
        (name == query_lower, 120.0, "exact_name"),
        (key == query_lower, 120.0, "exact_key"),
        (name.starts_with(query_lower), 70.0, "name_prefix"),
        (key.starts_with(query_lower), 60.0, "key_prefix"),
        (name.contains(query_lower), 50.0, "name_contains"),
        (key.contains(query_lower), 40.0, "key_contains"),
        (path.contains(query_lower), 32.0, "path_contains"),
    ];
    for &(holds, bonus, reason) in &whole_query_rules {
        if holds {
            score += bonus;
            reasons.push(reason.to_string());
        }
    }

    // Per-token bonuses: each token may hit name, key and path independently.
    let token_targets = [(&name, 10.0), (&key, 8.0), (&path, 6.0)];
    let mut token_match_count = 0usize;
    for token in query_tokens.iter().filter(|token| !token.is_empty()) {
        for &(target, bonus) in &token_targets {
            if target.contains(token.as_str()) {
                score += bonus;
                token_match_count += 1;
            }
        }
    }
    if token_match_count > 0 {
        reasons.push(format!("token_matches={token_match_count}"));
    }

    if fuzzy {
        let best = fuzzy_subsequence_ratio(query_lower, &name)
            .max(fuzzy_subsequence_ratio(query_lower, &key))
            .max(fuzzy_subsequence_ratio(query_lower, &path));
        if best > 0.0 {
            score += best * 25.0;
            reasons.push(format!("fuzzy={best:.2}"));
        }
    }

    // Small bonus so that better-ranked entity types win ties.
    score += (10 - type_rank).max(0) as f64 * 0.2;
    if reasons.is_empty() {
        reasons.push("fallback_rank".to_string());
    }
    (score, reasons.join(","))
}
/// Measures how much of `query` can be found, in order, inside `text`.
///
/// Both inputs are reduced to their ASCII letters and digits and lowercased.
/// The result is 0.0 when either reduced string is empty, 1.0 when the
/// reduced text contains the reduced query as a substring, and otherwise the
/// fraction of query characters that a left-to-right greedy scan of the text
/// matches in order.
fn fuzzy_subsequence_ratio(query: &str, text: &str) -> f64 {
    /// Keeps ASCII alphanumerics only and lowercases them.
    fn alnum_lower(input: &str) -> String {
        input
            .chars()
            .filter(char::is_ascii_alphanumeric)
            .map(|ch| ch.to_ascii_lowercase())
            .collect()
    }

    let needle = alnum_lower(query);
    let haystack = alnum_lower(text);
    // Also covers empty raw inputs, which reduce to empty strings.
    if needle.is_empty() || haystack.is_empty() {
        return 0.0;
    }
    if haystack.contains(needle.as_str()) {
        return 1.0;
    }

    let mut wanted = needle.chars().peekable();
    let mut matched = 0usize;
    for ch in haystack.chars() {
        match wanted.peek() {
            Some(&next) if next == ch => {
                wanted.next();
                matched += 1;
            }
            Some(_) => {}
            // Every query character has been matched already.
            None => break,
        }
    }
    // `needle` is pure ASCII here, so its byte length is its character count.
    matched as f64 / needle.len() as f64
}
/// Returns true for a fixed, case-sensitive list of ubiquitous Rust names
/// (`Ok`, `Err`, `Some`, `None`, `Result`, `Option`, `String`, `Vec`, `Box`,
/// `Self`, `self`).
fn is_low_signal_symbol_name(name: &str) -> bool {
    const LOW_SIGNAL_NAMES: [&str; 11] = [
        "Ok", "Err", "Some", "None", "Result", "Option", "String", "Vec", "Box", "Self", "self",
    ];
    LOW_SIGNAL_NAMES.contains(&name)
}
/// Heuristically decides whether `name` looks like a project-defined symbol.
///
/// A name qualifies when it is longer than two bytes, is not a low-signal
/// name, and does not match (ignoring ASCII case) one of a fixed list of
/// common standard-library and primitive type names.
fn is_project_local_symbol_name(name: &str) -> bool {
    const COMMON_TYPE_NAMES: [&str; 14] = [
        "string", "str", "vec", "box", "result", "option", "path", "pathbuf", "hashmap",
        "hashset", "usize", "u64", "i64", "bool",
    ];
    name.len() > 2
        && !is_low_signal_symbol_name(name)
        && !COMMON_TYPE_NAMES
            .iter()
            .any(|common| name.eq_ignore_ascii_case(common))
}
/// Inserts the entity identified by `key`, or updates the existing row with
/// that key, and returns the row id.
///
/// On a key conflict `entity_type` and `name` are always overwritten, while
/// the optional columns (`lang`, `file_path`, positions, `meta_json`) keep
/// their stored value whenever the new value is NULL.
///
/// # Errors
/// Returns any error raised by the upsert or by the follow-up id lookup.
fn ensure_entity_with_tx(
    tx: &rusqlite::Transaction<'_>,
    entity_type: &str,
    key: &str,
    name: &str,
    lang: Option<&str>,
    file_path: Option<&str>,
    line: Option<i64>,
    col: Option<i64>,
    end_line: Option<i64>,
    end_col: Option<i64>,
    meta_json: Option<String>,
) -> Result<i64> {
    const UPSERT_ENTITY_SQL: &str = "
INSERT INTO entities(entity_type, key, name, lang, file_path, line, col, end_line, end_col, meta_json)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)
ON CONFLICT(key) DO UPDATE SET
entity_type=excluded.entity_type,
name=excluded.name,
lang=COALESCE(excluded.lang, entities.lang),
file_path=COALESCE(excluded.file_path, entities.file_path),
line=COALESCE(excluded.line, entities.line),
col=COALESCE(excluded.col, entities.col),
end_line=COALESCE(excluded.end_line, entities.end_line),
end_col=COALESCE(excluded.end_col, entities.end_col),
meta_json=COALESCE(excluded.meta_json, entities.meta_json)
";
    tx.execute(
        UPSERT_ENTITY_SQL,
        params![entity_type, key, name, lang, file_path, line, col, end_line, end_col, meta_json],
    )?;
    // The id is read back by key so that both the insert and the update path
    // yield the id of the affected row.
    let entity_id = tx.query_row("SELECT id FROM entities WHERE key = ?1", [key], |row| {
        row.get(0)
    })?;
    Ok(entity_id)
}
/// Inserts one edge row from `src_entity_id` to `dst_entity_id` with the
/// given type, optional source location and optional metadata.
///
/// # Errors
/// Returns any error raised by the insert statement.
fn insert_edge_with_tx(
    tx: &rusqlite::Transaction<'_>,
    src_entity_id: i64,
    dst_entity_id: i64,
    edge_type: &str,
    file_path: Option<&str>,
    line: Option<i64>,
    col: Option<i64>,
    meta_json: Option<String>,
) -> Result<()> {
    const INSERT_EDGE_SQL: &str = "
INSERT INTO edges(src_entity_id, dst_entity_id, edge_type, file_path, line, col, meta_json)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)
";
    tx.execute(
        INSERT_EDGE_SQL,
        params![src_entity_id, dst_entity_id, edge_type, file_path, line, col, meta_json],
    )
    .map(|_inserted_rows| ())
    .map_err(Into::into)
}
/// Builds an `Entity` from a result row by column position.
///
/// The row must expose, in this order: id, entity_type, key, name, lang,
/// file_path, line, col, end_line, end_col, meta_json.
fn map_entity(row: &rusqlite::Row<'_>) -> rusqlite::Result<Entity> {
    let id = row.get(0)?;
    let entity_type = row.get(1)?;
    let key = row.get(2)?;
    let name = row.get(3)?;
    let lang = row.get(4)?;
    let file_path = row.get(5)?;
    let line = row.get(6)?;
    let col = row.get(7)?;
    let end_line = row.get(8)?;
    let end_col = row.get(9)?;
    let meta_json = row.get(10)?;
    Ok(Entity {
        id,
        entity_type,
        key,
        name,
        lang,
        file_path,
        line,
        col,
        end_line,
        end_col,
        meta_json,
    })
}
/// Returns the entity key for a file: `file:` followed by `path` verbatim.
pub fn file_key(path: &str) -> String {
    ["file:", path].concat()
}
/// Returns the entity key for a symbol name:
/// `symbol_name:<lang>:<symbol_name>`.
pub fn symbol_name_key(lang: &str, symbol_name: &str) -> String {
    ["symbol_name", lang, symbol_name].join(":")
}
/// Returns the entity key for a module: `module:<lang>:<module_name>`.
pub fn module_key(lang: &str, module_name: &str) -> String {
    ["module", lang, module_name].join(":")
}
/// Classifies well-known project files by their path.
///
/// Matching is case-insensitive and treats `\` like `/`. Whole path
/// components are compared, not string suffixes:
///
/// * `"config"`: the file name is `Cargo.toml`, `pyproject.toml`, `setup.cfg`
///   or `package.json`;
/// * `"entrypoint"`: the file name is `__main__.py`, or the file is `main.rs`
///   or `lib.rs` directly inside a directory named `src`;
/// * `None` for everything else.
///
/// Comparing components means repository-relative paths such as
/// `src/main.rs` are recognised even without a leading directory, and
/// look-alike names such as `notcargo.toml` are no longer misclassified.
fn classify_special_file(path: &str) -> Option<&'static str> {
    let lower = path.replace('\\', "/").to_lowercase();
    // Walk the path from the end: file name first, then its parent directory.
    let mut components = lower.rsplit('/');
    let file_name = components.next().unwrap_or_default();
    let parent_dir = components.next();

    if matches!(
        file_name,
        "cargo.toml" | "pyproject.toml" | "setup.cfg" | "package.json"
    ) {
        return Some("config");
    }

    let is_rust_entrypoint =
        parent_dir == Some("src") && matches!(file_name, "main.rs" | "lib.rs");
    if is_rust_entrypoint || file_name == "__main__.py" {
        return Some("entrypoint");
    }
    None
}
// Unit tests for the graph store: indexing, lookups, pagination, selector
// parsing, key builders and the scoring/classification helpers.
#[cfg(test)]
mod tests {
use super::*;
use crate::model::*;
use tempfile::TempDir;
// Opens a GraphStore on a database file inside a fresh temporary directory.
// The TempDir is returned alongside the store so the caller keeps it alive.
fn test_store() -> (GraphStore, TempDir) {
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("test.db");
let store = GraphStore::open(&db_path).unwrap();
(store, dir)
}
// Builds a Rust extraction with two definitions (`foo`, `Bar`), two
// references (`Bar` as Ref, `baz` as Call) and one import.
fn sample_extraction() -> FileExtraction {
FileExtraction {
language: LanguageKind::Rust,
definitions: vec![
Definition {
name: "foo".into(),
qualname: "foo".into(),
kind: "function_item".into(),
line: 1,
col: 1,
end_line: 3,
end_col: 1,
},
Definition {
name: "Bar".into(),
qualname: "Bar".into(),
kind: "struct_item".into(),
line: 5,
col: 1,
end_line: 7,
end_col: 1,
},
],
references: vec![
Reference {
name: "Bar".into(),
kind: ReferenceKind::Ref,
line: 2,
col: 5,
end_line: 2,
end_col: 8,
},
Reference {
name: "baz".into(),
kind: ReferenceKind::Call,
line: 2,
col: 10,
end_line: 2,
end_col: 13,
},
],
imports: vec![Import {
module: "std::collections::HashMap".into(),
line: 1,
col: 1,
}],
}
}
// Returns a store in which the sample extraction has been indexed as
// `src/main.rs` with content hash `abc123`; both slice arguments are empty.
fn store_with_sample_data() -> (GraphStore, TempDir) {
let (mut store, dir) = test_store();
let extraction = sample_extraction();
let mut outcome = UpsertOutcome::new();
store
.index_file(
"src/main.rs",
"rust",
"abc123",
100,
&extraction,
&[],
&[],
&mut outcome,
)
.unwrap();
(store, dir)
}
// Opening the same database file twice must succeed both times.
#[test]
fn test_open_creates_schema() {
let dir = TempDir::new().unwrap();
let db_path = dir.path().join("test.db");
let _store1 = GraphStore::open(&db_path).expect("first open should succeed");
let _store2 = GraphStore::open(&db_path).expect("second open should succeed");
}
// An indexed file reports its content hash and appears in tracked_files.
#[test]
fn test_index_file_and_tracked_hash() {
let (store, _dir) = store_with_sample_data();
let hash = store
.tracked_file_hash("src/main.rs")
.expect("tracked_file_hash should succeed");
assert_eq!(hash.as_deref(), Some("abc123"), "content hash should match");
let files = store.tracked_files().expect("tracked_files should succeed");
assert!(
files.contains("src/main.rs"),
"tracked_files should include src/main.rs"
);
}
// Removing the only indexed file counts one removal and empties tracked_files.
#[test]
fn test_remove_files() {
let (mut store, _dir) = store_with_sample_data();
let mut outcome = UpsertOutcome::new();
store
.remove_files(&["src/main.rs".to_string()], &mut outcome)
.expect("remove_files should succeed");
assert_eq!(outcome.removed, 1, "one file should be removed");
let files = store.tracked_files().expect("tracked_files should succeed");
assert!(
files.is_empty(),
"tracked_files should be empty after removal"
);
}
// The definition of `foo` keeps its file, kind and end line.
#[test]
fn test_symbol_definitions() {
let (store, _dir) = store_with_sample_data();
let defs = store
.symbol_definitions("foo")
.expect("symbol_definitions should succeed");
assert!(!defs.is_empty(), "should find definition for 'foo'");
assert_eq!(defs[0].file_path, "src/main.rs", "file_path should match");
assert_eq!(
defs[0].kind, "function_item",
"kind should be function_item"
);
assert_eq!(defs[0].end_line, Some(3), "end_line should be preserved");
}
// Looking up an unknown symbol succeeds with an empty result.
#[test]
fn test_symbol_definitions_nonexistent() {
let (store, _dir) = store_with_sample_data();
let defs = store
.symbol_definitions("nonexistent")
.expect("symbol_definitions should succeed for missing symbol");
assert!(
defs.is_empty(),
"should return empty for nonexistent symbol"
);
}
// Default options return the `references` edge recorded for `Bar`.
#[test]
fn test_symbol_references_page() {
let (store, _dir) = store_with_sample_data();
let options = ReferenceQueryOptions::default();
let (rows, pagination) = store
.symbol_references_page("Bar", &options)
.expect("symbol_references_page should succeed");
assert!(!rows.is_empty(), "should find references for 'Bar'");
assert!(pagination.total > 0, "total should be positive");
assert_eq!(
rows[0].edge_type, "references",
"edge_type should be references"
);
}
// With an edge type filter of `calls`, every returned row must be a call.
// Note: this passes vacuously if no rows are returned.
#[test]
fn test_symbol_references_page_calls_filter() {
let (store, _dir) = store_with_sample_data();
let options = ReferenceQueryOptions {
edge_type_filter: Some("calls".into()),
..Default::default()
};
let (rows, _pagination) = store
.symbol_references_page("baz", &options)
.expect("symbol_references_page with calls filter should succeed");
for row in &rows {
assert_eq!(row.edge_type, "calls", "all results should be calls");
}
}
// Every sort order must return the same number of rows.
#[test]
fn test_symbol_references_order_variants() {
let (store, _dir) = store_with_sample_data();
let orders = [
SortOrder::ScoreDesc,
SortOrder::LineAsc,
SortOrder::LineDesc,
];
let mut counts = Vec::new();
for order in &orders {
let options = ReferenceQueryOptions {
order: *order,
..Default::default()
};
let (rows, _) = store
.symbol_references_page("Bar", &options)
.expect("symbol_references_page should succeed for all orders");
counts.push(rows.len());
}
assert_eq!(
counts[0], counts[1],
"ScoreDesc and LineAsc should return same count"
);
assert_eq!(
counts[1], counts[2],
"LineAsc and LineDesc should return same count"
);
}
// Unknown endpoints yield a successful result with `found == false`.
#[test]
fn test_dependency_path_not_found() {
let (store, _dir) = store_with_sample_data();
let result = store
.dependency_path("file:nonexistent.rs", "file:also_nonexistent.rs", 5)
.expect("dependency_path should succeed even for missing entities");
assert!(
!result.found,
"should not find path between nonexistent files"
);
}
// A path from an entity to itself is found and consists of a single hop.
#[test]
fn test_dependency_path_same_entity() {
let (store, _dir) = store_with_sample_data();
let result = store
.dependency_path("file:src/main.rs", "file:src/main.rs", 5)
.expect("dependency_path for same entity should succeed");
assert!(result.found, "should find path to self");
assert_eq!(
result.hops.len(),
1,
"self-path should have exactly one hop"
);
}
// A slice can be produced for an indexed file with default options.
#[test]
fn test_minimal_slice_with_options() {
let (store, _dir) = store_with_sample_data();
let result = store
.minimal_slice_with_options("src/main.rs", None, 2, &SliceQueryOptions::default())
.expect("minimal_slice_with_options should succeed");
assert!(result.is_some(), "should return a slice for indexed file");
}
// Requesting a slice for an unknown file yields `None`, not an error.
#[test]
fn test_minimal_slice_missing_file() {
let (store, _dir) = store_with_sample_data();
let result = store
.minimal_slice_with_options("nonexistent.rs", None, 2, &SliceQueryOptions::default())
.expect("minimal_slice for missing file should succeed");
assert!(result.is_none(), "should return None for nonexistent file");
}
// Without stored fingerprints there are no matches and an empty_reason is set.
#[test]
fn test_clone_matches_no_fingerprints() {
let (store, _dir) = store_with_sample_data();
let options = CloneQueryOptions::default();
let (rows, _pagination, analysis) = store
.clone_matches_page("src/main.rs", &options)
.expect("clone_matches_page should succeed with no fingerprints");
assert!(rows.is_empty(), "no clone matches without fingerprints");
assert!(
analysis.empty_reason.is_some(),
"should have empty_reason when no fingerprints"
);
}
// Two files indexed with one tuple in common (first element 100) must
// produce at least one clone match when min_similarity is 0.
#[test]
fn test_clone_matches_with_fingerprints() {
let (mut store, _dir) = test_store();
let extraction = sample_extraction();
let mut outcome = UpsertOutcome::new();
store
.index_file(
"src/a.rs",
"rust",
"hash_a",
100,
&extraction,
&[(100, 0, 10), (200, 10, 20)],
&[],
&mut outcome,
)
.unwrap();
store
.index_file(
"src/b.rs",
"rust",
"hash_b",
100,
&extraction,
&[(100, 0, 10), (300, 10, 20)],
&[],
&mut outcome,
)
.unwrap();
let options = CloneQueryOptions {
min_similarity: 0.0,
..Default::default()
};
let (rows, _pagination, _analysis) = store
.clone_matches_page("src/a.rs", &options)
.expect("clone_matches_page should succeed with shared fingerprints");
assert!(
!rows.is_empty(),
"should find clone matches with shared fingerprints"
);
}
// Same fixture as above; clone_hotspots must report at least one hotspot.
#[test]
fn test_clone_hotspots() {
let (mut store, _dir) = test_store();
let extraction = sample_extraction();
let mut outcome = UpsertOutcome::new();
store
.index_file(
"src/a.rs",
"rust",
"hash_a",
100,
&extraction,
&[(100, 0, 10), (200, 10, 20)],
&[],
&mut outcome,
)
.unwrap();
store
.index_file(
"src/b.rs",
"rust",
"hash_b",
100,
&extraction,
&[(100, 0, 10), (300, 10, 20)],
&[],
&mut outcome,
)
.unwrap();
let options = CloneQueryOptions {
min_similarity: 0.0,
..Default::default()
};
let hotspots = store
.clone_hotspots("src/a.rs", &options)
.expect("clone_hotspots should succeed");
assert!(!hotspots.is_empty(), "should find at least one hotspot");
}
// A fuzzy query for `foo` returns at least one suggestion.
#[test]
fn test_selector_suggestions_advanced() {
let (store, _dir) = store_with_sample_data();
let options = SelectorSuggestOptions {
query: Some("foo".into()),
fuzzy: true,
limit: 10,
..Default::default()
};
let results = store
.selector_suggestions_advanced(&options)
.expect("selector_suggestions_advanced should succeed");
assert!(!results.is_empty(), "should find suggestions for 'foo'");
}
// Suggestions are still returned when no query is given.
#[test]
fn test_selector_suggestions_no_query() {
let (store, _dir) = store_with_sample_data();
let options = SelectorSuggestOptions {
limit: 10,
..Default::default()
};
let results = store
.selector_suggestions_advanced(&options)
.expect("selector_suggestions_advanced without query should succeed");
assert!(
!results.is_empty(),
"should return entities even without query"
);
}
// References are grouped per file; the file with the most references
// (a.rs, two of three) comes first.
#[test]
fn test_top_reference_files() {
let (store, _dir) = test_store();
let refs = vec![
ReferenceLocation {
symbol_name: "x".into(),
file_path: "a.rs".into(),
line: 1,
col: 1,
edge_type: "references".into(),
score: None,
why: None,
},
ReferenceLocation {
symbol_name: "x".into(),
file_path: "a.rs".into(),
line: 2,
col: 1,
edge_type: "references".into(),
score: None,
why: None,
},
ReferenceLocation {
symbol_name: "x".into(),
file_path: "b.rs".into(),
line: 1,
col: 1,
edge_type: "calls".into(),
score: None,
why: None,
},
];
let summary = store.top_reference_files(&refs, 10);
assert_eq!(summary.len(), 2, "should have 2 files");
assert_eq!(
summary[0].file_path, "a.rs",
"a.rs should be first (count=2)"
);
assert_eq!(summary[0].count, 2, "a.rs should have count=2");
}
// First page of 5 out of 10: more pages exist and the next offset is 5.
#[test]
fn test_build_pagination_basic() {
let p = build_pagination(10, 0, 5, 5);
assert_eq!(p.total, 10, "total should be 10");
assert!(p.has_more, "should have more pages");
assert_eq!(p.next_offset, Some(5), "next_offset should be 5");
}
// All 5 rows returned in one page: nothing further to fetch.
#[test]
fn test_build_pagination_no_more() {
let p = build_pagination(5, 0, 10, 5);
assert!(!p.has_more, "should not have more pages");
assert_eq!(p.next_offset, None, "next_offset should be None");
}
// An offset beyond the total is clamped to the total.
#[test]
fn test_build_pagination_offset_past_end() {
let p = build_pagination(5, 100, 10, 0);
assert_eq!(p.offset, 5, "offset should be clamped to total");
assert!(!p.has_more, "should not have more pages");
}
// `file:<path>` parses to the File variant carrying the path.
#[test]
fn test_parse_selector_file() {
let result = parse_selector("file:src/main.rs").expect("should parse file selector");
match result {
ParsedSelector::File(p) => assert_eq!(p, "src/main.rs", "path should match"),
other => panic!("expected File variant, got {:?}", other),
}
}
// `symbol_name:<lang>:<name>` parses to the SymbolName variant.
#[test]
fn test_parse_selector_symbol_name() {
let result =
parse_selector("symbol_name:rust:foo").expect("should parse symbol_name selector");
match result {
ParsedSelector::SymbolName { lang, name } => {
assert_eq!(lang, "rust", "lang should be rust");
assert_eq!(name, "foo", "name should be foo");
}
other => panic!("expected SymbolName variant, got {:?}", other),
}
}
// `symbol:<name>` parses to the Name variant.
#[test]
fn test_parse_selector_symbol() {
let result = parse_selector("symbol:main").expect("should parse symbol selector");
match result {
ParsedSelector::Name(n) => assert_eq!(n, "main", "name should be main"),
other => panic!("expected Name variant, got {:?}", other),
}
}
// `key:<key>` parses to the Key variant without the prefix.
#[test]
fn test_parse_selector_key() {
let result = parse_selector("key:some_key").expect("should parse key selector");
match result {
ParsedSelector::Key(k) => assert_eq!(k, "some_key", "key should match"),
other => panic!("expected Key variant, got {:?}", other),
}
}
// Unprefixed input parses to the Auto variant.
#[test]
fn test_parse_selector_auto() {
let result = parse_selector("main").expect("should parse auto selector");
match result {
ParsedSelector::Auto(v) => assert_eq!(v, "main", "auto value should match"),
other => panic!("expected Auto variant, got {:?}", other),
}
}
// Empty selectors and prefixes without a value are rejected.
#[test]
fn test_parse_selector_empty_errors() {
assert!(parse_selector("").is_err(), "empty selector should error");
assert!(
parse_selector("file:").is_err(),
"file: with no path should error"
);
assert!(
parse_selector("symbol_name:rust:").is_err(),
"symbol_name with empty name should error"
);
assert!(
parse_selector("symbol:").is_err(),
"symbol: with no name should error"
);
}
// Key builders: fixed prefix followed by the colon-separated parts.
#[test]
fn test_file_key() {
assert_eq!(
file_key("src/main.rs"),
"file:src/main.rs",
"file_key format"
);
}
#[test]
fn test_symbol_name_key() {
assert_eq!(
symbol_name_key("rust", "foo"),
"symbol_name:rust:foo",
"symbol_name_key format"
);
}
#[test]
fn test_module_key() {
assert_eq!(
module_key("rust", "std::io"),
"module:rust:std::io",
"module_key format"
);
}
// Config and entrypoint files are recognised; other files are not.
#[test]
fn test_classify_special_file() {
assert_eq!(
classify_special_file("project/Cargo.toml"),
Some("config"),
"Cargo.toml should be config"
);
assert_eq!(
classify_special_file("project/src/main.rs"),
Some("entrypoint"),
"src/main.rs should be entrypoint"
);
assert_eq!(
classify_special_file("project/src/lib.rs"),
Some("entrypoint"),
"src/lib.rs should be entrypoint"
);
assert_eq!(
classify_special_file("project/__main__.py"),
Some("entrypoint"),
"__main__.py should be entrypoint"
);
assert_eq!(
classify_special_file("random.rs"),
None,
"random.rs should be None"
);
}
// Ubiquitous names are low signal; project type names are not.
#[test]
fn test_is_low_signal_symbol_name() {
assert!(is_low_signal_symbol_name("Ok"), "Ok is low signal");
assert!(is_low_signal_symbol_name("Some"), "Some is low signal");
assert!(
!is_low_signal_symbol_name("GraphStore"),
"GraphStore is not low signal"
);
}
// Low-signal, very short and common standard-library names are excluded.
#[test]
fn test_is_project_local_symbol_name() {
assert!(
is_project_local_symbol_name("GraphStore"),
"GraphStore should be project local"
);
assert!(
!is_project_local_symbol_name("Ok"),
"Ok should not be project local (low signal)"
);
assert!(
!is_project_local_symbol_name("x"),
"single char should not be project local (too short)"
);
assert!(
!is_project_local_symbol_name("String"),
"String should not be project local (stdlib)"
);
}
// Exact matches score 1.0, empty inputs 0.0, unrelated strings below 1.0.
#[test]
fn test_fuzzy_subsequence_ratio() {
assert_eq!(
fuzzy_subsequence_ratio("abc", "abc"),
1.0,
"exact match should be 1.0"
);
assert!(
fuzzy_subsequence_ratio("xyz", "abc") < 1.0,
"non-matching should be < 1.0"
);
assert_eq!(
fuzzy_subsequence_ratio("", "abc"),
0.0,
"empty query should be 0.0"
);
assert_eq!(
fuzzy_subsequence_ratio("abc", ""),
0.0,
"empty text should be 0.0"
);
}
// An empty database has no files and is reported as stale.
#[test]
fn test_freshness_info_empty_db() {
let (store, _dir) = test_store();
let info = store
.freshness_info(24)
.expect("freshness_info should succeed");
assert!(info.is_stale, "empty db should be stale");
assert_eq!(info.file_count, 0, "empty db should have 0 files");
}
// Right after indexing, one file is counted and the index is fresh
// for a 24 hour threshold.
#[test]
fn test_freshness_info_after_indexing() {
let (store, _dir) = store_with_sample_data();
let info = store
.freshness_info(24)
.expect("freshness_info should succeed");
assert_eq!(info.file_count, 1, "should have 1 file after indexing");
assert!(!info.is_stale, "freshly indexed db should not be stale");
}
// An empty database produces a warning that mentions "empty".
#[test]
fn test_index_warning_empty_db() {
let (store, _dir) = test_store();
let warning = store
.index_warning(24)
.expect("index_warning should succeed");
assert!(warning.is_some(), "empty db should produce a warning");
assert!(
warning.unwrap().contains("empty"),
"warning should mention empty index"
);
}
// A freshly indexed database produces no warning.
#[test]
fn test_index_warning_after_indexing() {
let (store, _dir) = store_with_sample_data();
let warning = store
.index_warning(24)
.expect("index_warning should succeed");
assert!(
warning.is_none(),
"freshly indexed db should not produce a warning"
);
}
}