use crate::core::registry::{IndexHandle, IndexId};
use dashmap::DashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
/// Renders `path` as a string relative to `root`.
///
/// When `path` does not start with `root` (compared component-wise by
/// [`Path::strip_prefix`]), the path is rendered unchanged instead of failing.
/// The string is produced through [`Path::display`], so it is intended for
/// comparison and logging rather than for round-tripping back into a `Path`.
pub(super) fn to_corpus_relative_path(root: &Path, path: &Path) -> String {
    let relative = match path.strip_prefix(root) {
        Ok(stripped) => stripped,
        // Not located under `root`: fall back to the path exactly as given.
        Err(_) => path,
    };
    relative.display().to_string()
}
/// Prunes index data for files that the corpus store still lists as indexed
/// but that are absent from `walked_files`.
///
/// The pass works in four steps:
///
/// 1. Every entry of `walked_files` is converted to a corpus-relative string
///    with [`to_corpus_relative_path`].
/// 2. The corpus store's `list_indexed_files` is run on a blocking thread;
///    every listed path that is not in the walked set becomes a prune
///    candidate.
/// 3. Each candidate is re-checked on disk. Candidates that still exist under
///    `canonical_root` are skipped; the rest are passed to
///    `remove_file_no_kg_rebuild`, removed from `hashes`, and queued.
/// 4. The queued paths are handed to `delete_file_hash_entries` in a single
///    batched call on a blocking thread.
///
/// The function is best-effort: it returns `()` and reports every failure
/// through `tracing` rather than propagating it. It returns early (without
/// pruning anything) when the indexer has no corpus store, when listing the
/// indexed files fails, or when there is nothing to prune.
///
/// # Parameters
///
/// * `handle` - index handle; its `indexer` lock is read-locked to obtain the
///   corpus store and to perform per-file removal.
/// * `walked_files` - paths treated as currently present; anything indexed but
///   not in this list is a prune candidate.
/// * `canonical_root` - root used both to relativise `walked_files` and to
///   rebuild an absolute path for the on-disk existence check.
/// * `hashes` - map from which the entry keyed by the corpus-relative path is
///   removed for every pruned file.
/// * `index_id` - used only to prefix log messages.
pub(super) async fn prune_deleted_files_from_staging(
    handle: &IndexHandle,
    walked_files: &[PathBuf],
    canonical_root: &Path,
    hashes: &Arc<DashMap<PathBuf, String>>,
    index_id: &IndexId,
) {
    // Corpus-relative form of every walked file; indexed paths are compared
    // against this set as plain strings.
    let walked_set: std::collections::HashSet<String> = walked_files
        .iter()
        .map(|p| to_corpus_relative_path(canonical_root, p))
        .collect();

    // The read guard lives only inside this block, so it is released before
    // the blocking listing below is awaited.
    let corpus = {
        let indexer = handle.indexer.read().await;
        indexer.corpus_store()
    };
    let Some(corpus) = corpus else {
        return;
    };

    let indexed_files = match tokio::task::spawn_blocking(move || corpus.list_indexed_files()).await
    {
        Ok(Ok(files)) => files,
        Ok(Err(e)) => {
            tracing::warn!(
                "reindex[{}]: prune pass: could not list indexed files ({e}) — \
                 skipping prune",
                index_id.0
            );
            return;
        }
        // Outer `Err`: the blocking task itself did not complete.
        Err(e) => {
            tracing::warn!(
                "reindex[{}]: prune pass: list_indexed_files task panicked ({e}) — \
                 skipping prune",
                index_id.0
            );
            return;
        }
    };

    let deleted_files: Vec<String> = indexed_files
        .into_iter()
        .filter(|f| !walked_set.contains(f.as_str()))
        .collect();

    if deleted_files.is_empty() {
        tracing::debug!(
            "reindex[{}]: prune pass: no deleted files detected",
            index_id.0
        );
        return;
    }

    tracing::info!(
        "reindex[{}]: prune pass: {} deleted file(s) detected — pruning stale data",
        index_id.0,
        deleted_files.len()
    );

    let mut total_pruned_chunks: usize = 0;
    let mut pruned_paths_for_hash: Vec<String> = Vec::new();
    let mut failed_count: usize = 0;

    for file_path in &deleted_files {
        // Safety net: an indexed path missing from the walked set may still be
        // a live file whose string form simply differs. Never prune a path
        // that is present on disk.
        let absolute = canonical_root.join(file_path);
        if absolute.exists() {
            tracing::warn!(
                "reindex[{}]: prune: skipping {} — still exists on disk, \
                 likely a path-normalisation mismatch; will NOT prune live data",
                index_id.0,
                file_path,
            );
            continue;
        }

        // The read guard is held for the duration of the removal call and is
        // dropped at the end of this block.
        let n = {
            let indexer = handle.indexer.read().await;
            match indexer.remove_file_no_kg_rebuild(file_path).await {
                Ok(count) => count,
                Err(e) => {
                    tracing::warn!(
                        "reindex[{}]: prune pass: remove_file_no_kg_rebuild for {} failed ({e})",
                        index_id.0,
                        file_path,
                    );
                    failed_count += 1;
                    0
                }
            }
        };
        total_pruned_chunks += n;

        // Note: the hash entry is dropped and the path is queued even when the
        // removal above failed (`n == 0`); the failure is only reflected in
        // `failed_count`.
        hashes.remove(&PathBuf::from(file_path));
        pruned_paths_for_hash.push(file_path.clone());
        tracing::debug!(
            "reindex[{}]: prune pass: removed {} stale chunk(s) for deleted file {}",
            index_id.0,
            n,
            file_path,
        );
    }

    // Captured up front so the vector can be moved into the blocking task
    // below instead of being cloned just to keep its length available.
    let pruned_file_count = pruned_paths_for_hash.len();

    if !pruned_paths_for_hash.is_empty() {
        // Re-fetch the corpus store: the first handle was moved into the
        // listing task above.
        let corpus = {
            let indexer = handle.indexer.read().await;
            indexer.corpus_store()
        };
        if let Some(corpus) = corpus {
            let paths = pruned_paths_for_hash;
            // Only used in the log messages below (never captured by the
            // closure), so a borrow is sufficient.
            let idx = &index_id.0;
            match tokio::task::spawn_blocking(move || corpus.delete_file_hash_entries(&paths)).await
            {
                Ok(Ok(())) => {}
                Ok(Err(e)) => {
                    tracing::warn!("reindex[{idx}]: prune pass: batched hash delete failed ({e})");
                    failed_count += 1;
                }
                Err(e) => {
                    tracing::warn!(
                        "reindex[{idx}]: prune pass: batched hash delete task panicked ({e})"
                    );
                    failed_count += 1;
                }
            }
        }
    }

    // `failed_count` mixes per-file removal failures with (at most one)
    // failure of the batched hash delete.
    if failed_count > 0 {
        tracing::warn!(
            "reindex[{}]: prune pass: {} file(s) failed to fully prune — \
             ghost chunks may persist until next reindex",
            index_id.0,
            failed_count,
        );
    }

    tracing::info!(
        "reindex[{}]: prune pass: pruned {} stale chunk(s) from {} deleted file(s)",
        index_id.0,
        total_pruned_chunks,
        pruned_file_count,
    );
}
// Test-only module: compiled only under `cfg(test)`, with its body loaded from
// `prune_tests.rs` via the `#[path]` attribute.
#[cfg(test)]
#[path = "prune_tests.rs"]
mod prune_tests;