use std::collections::HashSet;
use std::path::PathBuf;
use crate::store::helpers::{StaleFile, StaleReport, StoreError};
use crate::store::Store;
/// Aggregate counts returned by `Store::prune_all`, one per table swept.
///
/// NOTE(review): counter widths are inconsistent (u32 / u64 / usize); they
/// mirror the casts applied at each deletion site in `prune_all` — confirm
/// before unifying, since these are public fields.
#[derive(Debug, Clone)]
pub struct PruneAllResult {
// Chunk rows deleted because their origin file no longer exists.
pub pruned_chunks: u32,
// `function_calls` rows whose file is no longer any chunk's origin.
pub pruned_calls: u64,
// `type_edges` rows whose source chunk id no longer exists.
pub pruned_type_edges: u64,
// `llm_summaries` rows whose content hash matches no remaining chunk.
pub pruned_summaries: usize,
}
impl Store {
/// Deletes chunks (and their FTS index rows) whose origin file is absent
/// from `existing_files`, returning the number of chunk rows removed.
///
/// Matching is lenient: besides an exact path match, an origin is kept when
/// it is a component-wise suffix of an existing path (or vice versa), so
/// relative vs. absolute origin forms do not trigger spurious pruning. On
/// macOS the comparison is case-insensitive, matching the default
/// filesystem behavior there.
///
/// # Errors
/// Propagates any database error as `StoreError`.
pub fn prune_missing(&self, existing_files: &HashSet<PathBuf>) -> Result<u32, StoreError> {
    let _span = tracing::info_span!("prune_missing", existing = existing_files.len()).entered();
    self.rt.block_on(async {
        // Every file-backed origin currently indexed.
        let rows: Vec<(String,)> = sqlx::query_as(
            "SELECT DISTINCT origin FROM chunks WHERE source_type = 'file'",
        )
        .fetch_all(&self.pool)
        .await?;
        let missing: Vec<String> = rows
            .into_iter()
            .filter(|(origin,)| {
                #[cfg(target_os = "macos")]
                {
                    // macOS filesystems are case-insensitive by default, so
                    // compare lowercased string forms.
                    let origin_lower = origin.to_lowercase();
                    !existing_files
                        .iter()
                        .any(|p| p.to_string_lossy().to_lowercase() == origin_lower)
                }
                #[cfg(not(target_os = "macos"))]
                {
                    // Binding lives inside the cfg branch so macOS builds do
                    // not emit an unused-variable warning.
                    let origin_path = PathBuf::from(origin);
                    if existing_files.contains(&origin_path) {
                        false
                    } else {
                        // Component-wise suffix match (Path::ends_with), not a
                        // byte-level str::ends_with: origin "main.rs" must not
                        // match an existing "xmain.rs".
                        !existing_files
                            .iter()
                            .any(|p| p.ends_with(&origin_path) || origin_path.ends_with(p))
                    }
                }
            })
            .map(|(origin,)| origin)
            .collect();
        if missing.is_empty() {
            return Ok(0);
        }
        // Batch deletions to stay well under SQLite's bound-parameter limit.
        const BATCH_SIZE: usize = 100;
        let mut deleted = 0u32;
        let mut tx = self.pool.begin().await?;
        for batch in missing.chunks(BATCH_SIZE) {
            let placeholder_str = crate::store::helpers::make_placeholders(batch.len());
            // Remove FTS rows first, while the chunk rows still exist for the
            // subselect to resolve their ids.
            let fts_query = format!(
                "DELETE FROM chunks_fts WHERE id IN (SELECT id FROM chunks WHERE origin IN ({}))",
                placeholder_str
            );
            let mut fts_stmt = sqlx::query(&fts_query);
            for origin in batch {
                fts_stmt = fts_stmt.bind(origin);
            }
            fts_stmt.execute(&mut *tx).await?;
            let chunks_query =
                format!("DELETE FROM chunks WHERE origin IN ({})", placeholder_str);
            let mut chunks_stmt = sqlx::query(&chunks_query);
            for origin in batch {
                chunks_stmt = chunks_stmt.bind(origin);
            }
            let result = chunks_stmt.execute(&mut *tx).await?;
            deleted += result.rows_affected() as u32;
        }
        if deleted > 0 {
            // Sparse vectors reference chunks by id; drop any now orphaned,
            // inside the same transaction.
            let sparse_result = sqlx::query(
                "DELETE FROM sparse_vectors WHERE chunk_id NOT IN \
                 (SELECT id FROM chunks)",
            )
            .execute(&mut *tx)
            .await?;
            let pruned_sparse = sparse_result.rows_affected();
            if pruned_sparse > 0 {
                tracing::debug!(pruned_sparse, "Pruned orphan sparse vectors in prune_missing tx");
            }
        }
        tx.commit().await?;
        if deleted > 0 {
            tracing::info!(deleted, files = missing.len(), "Pruned chunks for missing files");
        }
        Ok(deleted)
    })
}
/// Prunes chunks for missing files and then sweeps every table that can hold
/// rows orphaned by that removal (function_calls, type_edges, llm_summaries,
/// sparse_vectors), all inside one transaction.
///
/// Unlike `prune_missing`, the missing check here is an exact path match
/// (case-insensitive string compare on macOS) with no suffix fallback.
pub fn prune_all(
&self,
existing_files: &HashSet<PathBuf>,
) -> Result<PruneAllResult, StoreError> {
let _span = tracing::info_span!("prune_all", existing = existing_files.len()).entered();
self.rt.block_on(async {
// Every file-backed origin currently indexed.
let rows: Vec<(String,)> = sqlx::query_as(
"SELECT DISTINCT origin FROM chunks WHERE source_type = 'file'",
)
.fetch_all(&self.pool)
.await?;
let missing: Vec<String> = rows
.into_iter()
.filter(|(origin,)| {
// NOTE(review): this binding is only read in the non-macOS branch, so
// macOS builds will warn about it being unused — consider moving it
// into the cfg block.
let origin_path = PathBuf::from(origin);
#[cfg(target_os = "macos")]
{
// macOS filesystems are case-insensitive by default.
let origin_lower = origin.to_lowercase();
!existing_files
.iter()
.any(|p| p.to_string_lossy().to_lowercase() == origin_lower)
}
#[cfg(not(target_os = "macos"))]
{
!existing_files.contains(&origin_path)
}
})
.map(|(origin,)| origin)
.collect();
// The transaction is opened even when nothing is missing: the orphan
// sweeps below always run.
let mut tx = self.pool.begin().await?;
// Batch deletions to stay well under SQLite's bound-parameter limit.
const BATCH_SIZE: usize = 100;
let mut pruned_chunks = 0u32;
for batch in missing.chunks(BATCH_SIZE) {
let placeholder_str = crate::store::helpers::make_placeholders(batch.len());
// FTS rows go first, while the chunks subselect can still resolve ids.
let fts_query = format!(
"DELETE FROM chunks_fts WHERE id IN (SELECT id FROM chunks WHERE origin IN ({}))",
placeholder_str
);
let mut fts_stmt = sqlx::query(&fts_query);
for origin in batch {
fts_stmt = fts_stmt.bind(origin);
}
fts_stmt.execute(&mut *tx).await?;
let chunks_query =
format!("DELETE FROM chunks WHERE origin IN ({})", placeholder_str);
let mut chunks_stmt = sqlx::query(&chunks_query);
for origin in batch {
chunks_stmt = chunks_stmt.bind(origin);
}
let result = chunks_stmt.execute(&mut *tx).await?;
// NOTE(review): rows_affected() is u64; this cast assumes fewer than
// u32::MAX rows pruned per call.
pruned_chunks += result.rows_affected() as u32;
}
// Orphan sweeps: each table below references chunks (by origin, id, or
// content hash) and may now contain dangling rows.
let calls_result = sqlx::query(
"DELETE FROM function_calls WHERE file NOT IN (SELECT DISTINCT origin FROM chunks)",
)
.execute(&mut *tx)
.await?;
let pruned_calls = calls_result.rows_affected();
let types_result = sqlx::query(
"DELETE FROM type_edges WHERE source_chunk_id NOT IN (SELECT id FROM chunks)",
)
.execute(&mut *tx)
.await?;
let pruned_type_edges = types_result.rows_affected();
let summaries_result = sqlx::query(
"DELETE FROM llm_summaries WHERE content_hash NOT IN \
(SELECT DISTINCT content_hash FROM chunks)",
)
.execute(&mut *tx)
.await?;
let pruned_summaries = summaries_result.rows_affected() as usize;
let sparse_result = sqlx::query(
"DELETE FROM sparse_vectors WHERE chunk_id NOT IN \
(SELECT id FROM chunks)",
)
.execute(&mut *tx)
.await?;
let pruned_sparse = sparse_result.rows_affected() as usize;
if pruned_sparse > 0 {
tracing::debug!(pruned_sparse, "Pruned orphan sparse vectors in prune_all tx");
}
tx.commit().await?;
if pruned_chunks > 0 {
tracing::info!(pruned_chunks, files = missing.len(), "Pruned chunks for missing files");
}
if pruned_calls > 0 {
tracing::info!(pruned_calls, "Pruned stale call graph entries");
}
if pruned_type_edges > 0 {
tracing::info!(pruned_type_edges, "Pruned stale type edges");
}
if pruned_summaries > 0 {
tracing::info!(pruned_summaries, "Pruned orphan LLM summaries");
}
Ok(PruneAllResult {
pruned_chunks,
pruned_calls,
pruned_type_edges,
pruned_summaries,
})
})
}
/// Convenience wrapper over [`Self::list_stale_files`] that reduces the
/// report to a `(stale_count, missing_count)` pair.
pub fn count_stale_files(
    &self,
    existing_files: &HashSet<PathBuf>,
) -> Result<(u64, u64), StoreError> {
    let _span = tracing::debug_span!("count_stale_files").entered();
    self.list_stale_files(existing_files)
        .map(|report| (report.stale.len() as u64, report.missing.len() as u64))
}
/// Compares every indexed file origin against the filesystem and reports
/// which origins are stale (on-disk mtime newer than the stored one, or no
/// stored mtime at all) and which are missing from `existing_files`.
pub fn list_stale_files(
    &self,
    existing_files: &HashSet<PathBuf>,
) -> Result<StaleReport, StoreError> {
    let _span = tracing::debug_span!("list_stale_files").entered();
    self.rt.block_on(async {
        let indexed: Vec<(String, Option<i64>)> = sqlx::query_as(
            "SELECT DISTINCT origin, source_mtime FROM chunks WHERE source_type = 'file'",
        )
        .fetch_all(&self.pool)
        .await?;
        let mut report = StaleReport {
            stale: Vec::new(),
            missing: Vec::new(),
            total_indexed: indexed.len() as u64,
        };
        for (origin, recorded) in indexed {
            let file = PathBuf::from(&origin);
            // Not in the caller's set at all: the file is gone.
            if !existing_files.contains(&file) {
                report.missing.push(file);
                continue;
            }
            // A row without a recorded mtime is always considered stale;
            // both timestamps are reported as 0 in that case.
            let Some(stored) = recorded else {
                report.stale.push(StaleFile {
                    file,
                    stored_mtime: 0,
                    current_mtime: 0,
                });
                continue;
            };
            let on_disk = file
                .metadata()
                .and_then(|m| m.modified())
                .ok()
                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                .map(|d| d.as_millis() as i64);
            // Unreadable metadata is treated as fresh here (the file is known
            // to exist via `existing_files`).
            if let Some(current) = on_disk {
                if current > stored {
                    report.stale.push(StaleFile {
                        file,
                        stored_mtime: stored,
                        current_mtime: current,
                    });
                }
            }
        }
        Ok(report)
    })
}
/// Returns the subset of `origins` whose indexed chunks are out of date
/// relative to the file on disk (origin paths resolved against `root`).
///
/// An origin is stale when its recorded mtime is older than the file's
/// current mtime, when any of its chunks lacks a recorded mtime, or when the
/// file cannot be stat'ed. Origins with no rows in `chunks` simply do not
/// appear in the result.
///
/// # Errors
/// Propagates any database error as `StoreError`.
pub fn check_origins_stale(
    &self,
    origins: &[&str],
    root: &std::path::Path,
) -> Result<HashSet<String>, StoreError> {
    let _span = tracing::info_span!("check_origins_stale", count = origins.len()).entered();
    if origins.is_empty() {
        return Ok(HashSet::new());
    }
    self.rt.block_on(async {
        let mut stale = HashSet::new();
        // Stay under SQLite's default bound-parameter limit (999).
        const BATCH_SIZE: usize = 900;
        for batch in origins.chunks(BATCH_SIZE) {
            let placeholders = crate::store::helpers::make_placeholders(batch.len());
            // A bare `source_mtime` column under GROUP BY would come from an
            // arbitrary row of each group in SQLite. Aggregate explicitly:
            // COALESCE maps NULL to 0 so MIN() keeps the most conservative
            // value and a single NULL-mtime chunk marks the origin stale.
            let sql = format!(
                "SELECT origin, MIN(COALESCE(source_mtime, 0)) FROM chunks WHERE origin IN ({}) GROUP BY origin",
                placeholders
            );
            let mut query = sqlx::query_as::<_, (String, Option<i64>)>(&sql);
            for origin in batch {
                query = query.bind(*origin);
            }
            let rows = query.fetch_all(&self.pool).await?;
            for (origin, stored_mtime) in rows {
                // COALESCE above should make this always Some; treat an
                // unexpected NULL defensively as stale.
                let stored = match stored_mtime {
                    Some(m) => m,
                    None => {
                        stale.insert(origin);
                        continue;
                    }
                };
                debug_assert!(
                    !origin.contains('\\'),
                    "DB origin contains backslash: {origin}"
                );
                let path = PathBuf::from(crate::normalize_path(&root.join(&origin)));
                let current_mtime = path
                    .metadata()
                    .and_then(|m| m.modified())
                    .ok()
                    .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                    .map(|d| d.as_millis() as i64);
                if let Some(current) = current_mtime {
                    if current > stored {
                        stale.insert(origin);
                    }
                } else {
                    // File vanished or is unreadable: force a re-index.
                    stale.insert(origin);
                }
            }
        }
        Ok(stale)
    })
}
}
#[cfg(test)]
mod tests {
    use super::super::test_utils::make_chunk;
    use crate::parser::{Chunk, ChunkType, Language};
    use crate::test_helpers::{mock_embedding, setup_store};
    use std::collections::HashSet;
    use std::path::Path;

    /// Builds a minimal single-function chunk for `file_path`. `name` is used
    /// for the function name/signature/content and `hash` for the content
    /// hash embedded in the chunk id. (Replaces five copy-pasted literals.)
    fn file_chunk(file_path: &Path, name: &str, hash: &str) -> Chunk {
        let origin = file_path.to_string_lossy().to_string();
        Chunk {
            id: format!("{}:1:{}", origin, hash),
            file: file_path.to_path_buf(),
            language: Language::Rust,
            chunk_type: ChunkType::Function,
            name: name.to_string(),
            signature: format!("fn {}()", name),
            content: format!("fn {}() {{}}", name),
            doc: None,
            line_start: 1,
            line_end: 1,
            content_hash: hash.to_string(),
            parent_id: None,
            window_idx: None,
            parent_type_name: None,
        }
    }

    /// Creates parent directories and writes `content` to `path`.
    fn write_source(path: &Path, content: &str) {
        std::fs::create_dir_all(path.parent().unwrap()).unwrap();
        std::fs::write(path, content).unwrap();
    }

    /// Current filesystem mtime of `path` in milliseconds since the epoch.
    fn mtime_ms(path: &Path) -> i64 {
        path.metadata()
            .unwrap()
            .modified()
            .unwrap()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_millis() as i64
    }

    #[test]
    fn test_list_stale_files_empty_index() {
        let (store, _dir) = setup_store();
        let existing = HashSet::new();
        let report = store.list_stale_files(&existing).unwrap();
        assert!(report.stale.is_empty());
        assert!(report.missing.is_empty());
        assert_eq!(report.total_indexed, 0);
    }

    #[test]
    fn test_list_stale_files_all_fresh() {
        let (store, dir) = setup_store();
        let file_path = dir.path().join("src/fresh.rs");
        write_source(&file_path, "fn fresh() {}");
        let c = file_chunk(&file_path, "fresh", "abc");
        // Store the file's real mtime so the chunk is up to date.
        let mtime = mtime_ms(&file_path);
        store
            .upsert_chunks_batch(&[(c, mock_embedding(1.0))], Some(mtime))
            .unwrap();
        let mut existing = HashSet::new();
        existing.insert(file_path);
        let report = store.list_stale_files(&existing).unwrap();
        assert!(report.stale.is_empty());
        assert!(report.missing.is_empty());
        assert_eq!(report.total_indexed, 1);
    }

    #[test]
    fn test_list_stale_files_detects_modified() {
        let (store, dir) = setup_store();
        let file_path = dir.path().join("src/stale.rs");
        write_source(&file_path, "fn stale() {}");
        let c = file_chunk(&file_path, "stale", "abc");
        // Stored mtime far in the past: the on-disk file must look newer.
        store
            .upsert_chunks_batch(&[(c, mock_embedding(1.0))], Some(1000))
            .unwrap();
        let mut existing = HashSet::new();
        existing.insert(file_path);
        let report = store.list_stale_files(&existing).unwrap();
        assert_eq!(report.stale.len(), 1);
        assert_eq!(report.stale[0].stored_mtime, 1000);
        assert!(report.stale[0].current_mtime > 1000);
        assert!(report.missing.is_empty());
        assert_eq!(report.total_indexed, 1);
    }

    #[test]
    fn test_list_stale_files_detects_missing() {
        let (store, _dir) = setup_store();
        let c = make_chunk("gone", "/nonexistent/file.rs");
        store
            .upsert_chunks_batch(&[(c, mock_embedding(1.0))], Some(1000))
            .unwrap();
        let existing = HashSet::new();
        let report = store.list_stale_files(&existing).unwrap();
        assert!(report.stale.is_empty());
        assert_eq!(report.missing.len(), 1);
        assert_eq!(report.total_indexed, 1);
    }

    #[test]
    fn test_list_stale_files_null_mtime() {
        let (store, dir) = setup_store();
        let file_path = dir.path().join("src/null.rs");
        write_source(&file_path, "fn null() {}");
        let c = file_chunk(&file_path, "null", "abc");
        // No mtime recorded at all.
        store
            .upsert_chunks_batch(&[(c, mock_embedding(1.0))], None)
            .unwrap();
        let mut existing = HashSet::new();
        existing.insert(file_path);
        let report = store.list_stale_files(&existing).unwrap();
        assert_eq!(
            report.stale.len(),
            1,
            "NULL mtime should be treated as stale"
        );
    }

    #[test]
    fn test_check_origins_stale_empty_list() {
        let (store, _dir) = setup_store();
        let stale = store
            .check_origins_stale(&[], Path::new("/"))
            .unwrap();
        assert!(stale.is_empty());
    }

    #[test]
    fn test_check_origins_stale_all_fresh() {
        let (store, dir) = setup_store();
        let file_path = dir.path().join("src/fresh.rs");
        write_source(&file_path, "fn fresh() {}");
        let origin = file_path.to_string_lossy().to_string();
        let c = file_chunk(&file_path, "fresh", "abc");
        let mtime = mtime_ms(&file_path);
        store
            .upsert_chunks_batch(&[(c, mock_embedding(1.0))], Some(mtime))
            .unwrap();
        let stale = store.check_origins_stale(&[&origin], dir.path()).unwrap();
        assert!(stale.is_empty());
    }

    #[test]
    fn test_check_origins_stale_mixed() {
        let (store, dir) = setup_store();
        // One fresh file with its real mtime stored...
        let fresh_path = dir.path().join("src/fresh.rs");
        write_source(&fresh_path, "fn fresh() {}");
        let fresh_origin = fresh_path.to_string_lossy().to_string();
        let c_fresh = file_chunk(&fresh_path, "fresh", "fresh");
        let fresh_mtime = mtime_ms(&fresh_path);
        store
            .upsert_chunks_batch(&[(c_fresh, mock_embedding(1.0))], Some(fresh_mtime))
            .unwrap();
        // ...and one whose stored mtime predates the file on disk.
        let stale_path = dir.path().join("src/stale.rs");
        write_source(&stale_path, "fn stale() {}");
        let stale_origin = stale_path.to_string_lossy().to_string();
        let c_stale = file_chunk(&stale_path, "stale", "stale");
        store
            .upsert_chunks_batch(&[(c_stale, mock_embedding(2.0))], Some(1000))
            .unwrap();
        let stale = store
            .check_origins_stale(&[&fresh_origin, &stale_origin], dir.path())
            .unwrap();
        assert_eq!(stale.len(), 1);
        assert!(stale.contains(&stale_origin));
        assert!(!stale.contains(&fresh_origin));
    }

    #[test]
    fn test_check_origins_stale_unknown_origin() {
        let (store, _dir) = setup_store();
        let stale = store
            .check_origins_stale(&["nonexistent/file.rs"], Path::new("/"))
            .unwrap();
        assert!(
            stale.is_empty(),
            "Unknown origin should not appear in stale set"
        );
    }
}