use std::path::Path;
use ahash::AHashSet;
use serde::Serialize;
use thiserror::Error;
use crate::store::{
BLOBS_DIR, INDEX_FILE, StoreError, VIEWS_DIR, acquire_lock, read_index, wipe_blobs,
};
/// File-name suffixes that identify a blob file; the part of the name in front of
/// the suffix is the stem that gets compared against the hashes recorded in view indexes.
const BLOB_SUFFIXES: [&str; 3] = [".l1.msgpack", ".l2.msgpack", ".doc.msgpack"];
/// Name of the telemetry file, which is looked up directly inside the basemind directory.
const TELEMETRY_FILENAME: &str = "telemetry.jsonl";
/// Errors returned by the garbage-collection and cache-maintenance functions in this file.
#[derive(Debug, Error)]
pub enum GcError {
/// An error from the store layer, forwarded as-is (`transparent`) and created
/// automatically from a `StoreError` via `From`.
#[error(transparent)]
Store(#[from] StoreError),
/// A filesystem operation failed.
#[error("io error on {path}: {source}")]
Io {
/// Path associated with the failed operation. For failures while iterating a
/// directory this is the directory being listed, not the individual entry.
path: std::path::PathBuf,
/// The underlying I/O error.
#[source]
source: std::io::Error,
},
}
/// Selects which part of the on-disk cache `clear_component` removes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheComponent {
/// The blob store (cleared through `wipe_blobs`).
Blobs,
/// The views directory.
Views,
/// The Lance directory (only cleared when the `intelligence` feature is enabled).
Lance,
/// The git cache directory.
GitCache,
/// The telemetry file.
Telemetry,
/// The entire basemind directory.
All,
}
impl CacheComponent {
pub fn as_str(self) -> &'static str {
match self {
CacheComponent::Blobs => "blobs",
CacheComponent::Views => "views",
CacheComponent::Lance => "lance",
CacheComponent::GitCache => "git-cache",
CacheComponent::Telemetry => "telemetry",
CacheComponent::All => "all",
}
}
}
impl std::str::FromStr for CacheComponent {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"blobs" => Ok(CacheComponent::Blobs),
"views" => Ok(CacheComponent::Views),
"lance" => Ok(CacheComponent::Lance),
"git-cache" => Ok(CacheComponent::GitCache),
"telemetry" => Ok(CacheComponent::Telemetry),
"all" => Ok(CacheComponent::All),
other => Err(format!(
"unknown cache component {other:?}; expected one of \
blobs|views|lance|git-cache|telemetry|all"
)),
}
}
}
/// Summary of one blob garbage-collection pass.
#[derive(Debug, Clone, Serialize)]
pub struct GcReport {
/// Number of regular files with UTF-8 names that were inspected in the blobs
/// directory, whether or not they carry a blob suffix.
pub scanned: usize,
/// Number of blob files that were deleted.
pub removed: usize,
/// Sum of the sizes, in bytes, of the deleted files.
pub bytes_freed: u64,
}
/// Disk-usage and blob statistics for a basemind directory, as produced by `cache_stats`.
#[derive(Debug, Clone, Serialize)]
pub struct CacheStats {
/// Total size in bytes of the blobs directory.
pub blobs_bytes: u64,
/// Total size in bytes of the views directory.
pub views_bytes: u64,
/// Total size in bytes of the `lance` sub-directory.
pub lance_bytes: u64,
/// Total size in bytes of the git cache directory.
pub git_cache_bytes: u64,
/// Size in bytes of the telemetry file.
pub telemetry_bytes: u64,
/// Number of files in the blobs directory whose name ends in a known blob suffix.
pub blob_count: usize,
/// Number of those blob files whose stem is not referenced by any view index.
pub orphan_blob_count: usize,
/// `(view directory name, number of files in that view's index)` pairs, sorted by name.
pub per_view_file_count: Vec<(String, usize)>,
}
/// Gathers the hashes referenced by every view index under `basemind_dir`.
///
/// Each sub-directory of the views directory is treated as a view. A view that
/// has no index file is skipped with a warning, and a view for which
/// `read_index` yields `None` is skipped silently. When the views directory does
/// not exist the returned set is empty.
///
/// # Errors
///
/// Returns [`GcError::Io`] when the views directory cannot be listed and
/// [`GcError::Store`] when an existing index cannot be read.
pub fn collect_referenced_hashes(basemind_dir: &Path) -> Result<AHashSet<String>, GcError> {
    let views_root = basemind_dir.join(VIEWS_DIR);
    let mut live_hashes = AHashSet::new();
    if !views_root.exists() {
        return Ok(live_hashes);
    }

    for dir_entry in read_dir(&views_root)? {
        let candidate = match dir_entry {
            Ok(found) => found.path(),
            Err(source) => {
                return Err(GcError::Io {
                    path: views_root.clone(),
                    source,
                });
            }
        };
        if !candidate.is_dir() {
            continue;
        }
        if !candidate.join(INDEX_FILE).exists() {
            tracing::warn!(view = %candidate.display(), "view has no index.msgpack; skipping");
            continue;
        }
        // A read failure aborts the whole collection instead of yielding a partial set.
        let Some(index) = read_index(&candidate)? else {
            continue;
        };
        live_hashes.extend(index.files.values().map(|file| file.hash_hex.clone()));
    }

    Ok(live_hashes)
}
pub fn gc_blobs(basemind_dir: &Path, referenced: &AHashSet<String>) -> Result<GcReport, GcError> {
let blobs_dir = basemind_dir.join(BLOBS_DIR);
let mut report = GcReport {
scanned: 0,
removed: 0,
bytes_freed: 0,
};
if !blobs_dir.exists() {
return Ok(report);
}
for entry in read_dir(&blobs_dir)? {
let entry = entry.map_err(|source| GcError::Io {
path: blobs_dir.clone(),
source,
})?;
let path = entry.path();
if !path.is_file() {
continue;
}
let Some(file_name) = path.file_name().and_then(|n| n.to_str()) else {
continue;
};
let Some(stem) = blob_stem(file_name) else {
report.scanned += 1;
continue;
};
report.scanned += 1;
if referenced.contains(stem) {
continue;
}
let size = std::fs::metadata(&path)
.map_err(|source| GcError::Io {
path: path.clone(),
source,
})?
.len();
std::fs::remove_file(&path).map_err(|source| GcError::Io {
path: path.clone(),
source,
})?;
report.removed += 1;
report.bytes_freed += size;
}
Ok(report)
}
/// Runs a full garbage-collection pass: acquires the store lock, collects the
/// hashes referenced by the view indexes, then removes unreferenced blobs.
///
/// # Errors
///
/// Propagates any failure from acquiring the lock, reading the indexes, or
/// deleting blobs.
pub fn run_gc(basemind_dir: &Path) -> Result<GcReport, GcError> {
    // Bound to a name so the value lives until the function returns; presumably
    // the lock is released when this guard is dropped.
    let _guard = acquire_lock(basemind_dir)?;
    collect_referenced_hashes(basemind_dir).and_then(|live| gc_blobs(basemind_dir, &live))
}
pub fn clear_component(basemind_dir: &Path, component: CacheComponent) -> Result<(), GcError> {
match component {
CacheComponent::Blobs => wipe_blobs(basemind_dir)?,
CacheComponent::Views => remove_dir_if_exists(&basemind_dir.join(VIEWS_DIR))?,
CacheComponent::Lance => clear_lance(basemind_dir)?,
CacheComponent::GitCache => {
remove_dir_if_exists(&basemind_dir.join(crate::git_cache::GIT_CACHE_DIR))?
}
CacheComponent::Telemetry => remove_file_if_exists(&basemind_dir.join(TELEMETRY_FILENAME))?,
CacheComponent::All => remove_dir_if_exists(basemind_dir)?,
}
Ok(())
}
/// Computes disk usage and blob statistics for `basemind_dir`.
///
/// Blobs are counted from the regular files in the blobs directory whose name
/// ends in a known blob suffix; a blob is an orphan when its stem is not
/// referenced by any view index. Missing directories and a missing telemetry
/// file contribute zero.
///
/// # Errors
///
/// Returns an error when the view indexes cannot be collected or when a
/// directory walk or stat fails.
pub fn cache_stats(basemind_dir: &Path) -> Result<CacheStats, GcError> {
    let blob_root = basemind_dir.join(BLOBS_DIR);
    let live = collect_referenced_hashes(basemind_dir)?;

    let (mut blob_count, mut orphan_blob_count) = (0usize, 0usize);
    if blob_root.exists() {
        for dir_entry in read_dir(&blob_root)? {
            let candidate = match dir_entry {
                Ok(found) => found.path(),
                Err(source) => {
                    return Err(GcError::Io {
                        path: blob_root.clone(),
                        source,
                    });
                }
            };
            if !candidate.is_file() {
                continue;
            }
            let name = candidate.file_name().and_then(|raw| raw.to_str());
            if let Some(stem) = name.and_then(blob_stem) {
                blob_count += 1;
                if !live.contains(stem) {
                    orphan_blob_count += 1;
                }
            }
        }
    }

    // Sizes are gathered in the same order as the fields of `CacheStats`.
    let blobs_bytes = dir_size(&blob_root)?;
    let views_bytes = dir_size(&basemind_dir.join(VIEWS_DIR))?;
    let lance_bytes = dir_size(&basemind_dir.join("lance"))?;
    let git_cache_bytes = dir_size(&basemind_dir.join(crate::git_cache::GIT_CACHE_DIR))?;
    let telemetry_bytes = file_size(&basemind_dir.join(TELEMETRY_FILENAME))?;
    let view_counts = per_view_file_count(basemind_dir)?;

    Ok(CacheStats {
        blobs_bytes,
        views_bytes,
        lance_bytes,
        git_cache_bytes,
        telemetry_bytes,
        blob_count,
        orphan_blob_count,
        per_view_file_count: view_counts,
    })
}
/// Returns `file_name` with its blob suffix removed, or `None` when the name does
/// not end in any entry of `BLOB_SUFFIXES`.
///
/// Suffixes are tried in declaration order and the first match wins.
fn blob_stem(file_name: &str) -> Option<&str> {
    for suffix in BLOB_SUFFIXES.iter() {
        if let Some(stem) = file_name.strip_suffix(*suffix) {
            return Some(stem);
        }
    }
    None
}
/// Lists every view directory together with the number of files in its index,
/// sorted by directory name.
///
/// A view whose index is absent or cannot be read is reported with a count of
/// zero, and a directory name that is not valid UTF-8 is reported as `"?"`. A
/// missing views directory yields an empty list.
///
/// # Errors
///
/// Returns [`GcError::Io`] when the views directory cannot be listed.
fn per_view_file_count(basemind_dir: &Path) -> Result<Vec<(String, usize)>, GcError> {
    let views_root = basemind_dir.join(VIEWS_DIR);
    let mut counts = Vec::new();
    if !views_root.exists() {
        return Ok(counts);
    }

    for dir_entry in read_dir(&views_root)? {
        let candidate = match dir_entry {
            Ok(found) => found.path(),
            Err(source) => {
                return Err(GcError::Io {
                    path: views_root.clone(),
                    source,
                });
            }
        };
        if !candidate.is_dir() {
            continue;
        }

        let label = match candidate.file_name().and_then(|raw| raw.to_str()) {
            Some(text) => text.to_string(),
            None => "?".to_string(),
        };
        // Index read failures are deliberately swallowed here and counted as zero.
        let indexed = match read_index(&candidate) {
            Ok(Some(index)) => index.files.len(),
            Ok(None) | Err(_) => 0,
        };
        counts.push((label, indexed));
    }

    counts.sort_by(|(left, _), (right, _)| left.cmp(right));
    Ok(counts)
}
/// Removes the Lance directory inside `basemind_dir`; compiled only when the
/// `intelligence` feature is enabled.
///
/// # Errors
///
/// Returns [`GcError::Io`] when the directory cannot be removed.
#[cfg(feature = "intelligence")]
fn clear_lance(basemind_dir: &Path) -> Result<(), GcError> {
    let lance_root = basemind_dir.join(crate::store::LANCE_DIR);
    remove_dir_if_exists(&lance_root)
}
/// Fallback compiled when the `intelligence` feature is disabled: touches nothing
/// on disk and always returns `Ok(())`.
#[cfg(not(feature = "intelligence"))]
fn clear_lance(_basemind_dir: &Path) -> Result<(), GcError> {
Ok(())
}
/// Recursively removes `dir`, treating a path that is already absent as success.
///
/// The removal is attempted directly rather than being guarded by
/// `Path::exists`. That check follows symlinks and reports `false` for dangling
/// links and for paths whose metadata cannot be read, so a guarded version can
/// return success while leaving the entry in place. It also leaves a window in
/// which the path can disappear between the check and the removal.
///
/// # Errors
///
/// Returns [`GcError::Io`] for any failure other than the path not existing.
fn remove_dir_if_exists(dir: &Path) -> Result<(), GcError> {
    match std::fs::remove_dir_all(dir) {
        Ok(()) => Ok(()),
        // Nothing to remove: either it was never there or someone else removed it first.
        Err(source) if source.kind() == std::io::ErrorKind::NotFound => Ok(()),
        Err(source) => Err(GcError::Io {
            path: dir.to_path_buf(),
            source,
        }),
    }
}
/// Removes the file at `path`, treating a path that is already absent as success.
///
/// The removal is attempted directly rather than being guarded by
/// `Path::exists`. That check follows symlinks and reports `false` for dangling
/// links and for paths whose metadata cannot be read, so a guarded version can
/// return success while leaving the entry in place. It also leaves a window in
/// which the file can disappear between the check and the removal.
///
/// # Errors
///
/// Returns [`GcError::Io`] for any failure other than the path not existing.
fn remove_file_if_exists(path: &Path) -> Result<(), GcError> {
    match std::fs::remove_file(path) {
        Ok(()) => Ok(()),
        // Nothing to remove: either it was never there or someone else removed it first.
        Err(source) if source.kind() == std::io::ErrorKind::NotFound => Ok(()),
        Err(source) => Err(GcError::Io {
            path: path.to_path_buf(),
            source,
        }),
    }
}
/// Opens `dir` for iteration, attaching the directory path to any I/O failure.
///
/// # Errors
///
/// Returns [`GcError::Io`] carrying `dir` when the directory cannot be read.
fn read_dir(dir: &Path) -> Result<std::fs::ReadDir, GcError> {
    match std::fs::read_dir(dir) {
        Ok(entries) => Ok(entries),
        Err(source) => Err(GcError::Io {
            path: dir.to_path_buf(),
            source,
        }),
    }
}
/// Returns the size in bytes of the file at `path`, or `0` when it does not exist.
///
/// The metadata is queried once and a "not found" result is mapped to zero,
/// instead of probing with `Path::exists` first. The two-step form fails
/// spuriously if the file is removed between the probe and the stat, and it
/// reports `0` for paths whose metadata cannot be read rather than surfacing
/// the error.
///
/// # Errors
///
/// Returns [`GcError::Io`] for any stat failure other than the path not existing.
fn file_size(path: &Path) -> Result<u64, GcError> {
    match std::fs::metadata(path) {
        Ok(meta) => Ok(meta.len()),
        Err(source) if source.kind() == std::io::ErrorKind::NotFound => Ok(0),
        Err(source) => Err(GcError::Io {
            path: path.to_path_buf(),
            source,
        }),
    }
}
/// Returns the combined size in bytes of every non-directory entry below `dir`,
/// descending into sub-directories recursively.
///
/// A missing `dir` contributes `0`. This is detected from the listing call
/// itself rather than a preceding `Path::exists` probe, so a directory removed
/// between the probe and the listing no longer produces an error. For the same
/// reason, an entry that disappears after it was listed is skipped instead of
/// failing the whole walk. `cache_stats` runs without taking the store lock, so
/// files may be deleted while the walk is in progress.
///
/// Entry metadata comes from `DirEntry::metadata`, which does not follow
/// symlinks, so a symlink is counted by its own length and never descended into.
///
/// # Errors
///
/// Returns [`GcError::Io`] for any listing or stat failure other than the path
/// not existing.
fn dir_size(dir: &Path) -> Result<u64, GcError> {
    // Called directly (not through the `read_dir` helper) so that "not found"
    // can be told apart from other failures before it is wrapped.
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        Err(source) if source.kind() == std::io::ErrorKind::NotFound => return Ok(0),
        Err(source) => {
            return Err(GcError::Io {
                path: dir.to_path_buf(),
                source,
            });
        }
    };

    let mut total = 0u64;
    for entry in entries {
        let entry = entry.map_err(|source| GcError::Io {
            path: dir.to_path_buf(),
            source,
        })?;
        let path = entry.path();
        let meta = match entry.metadata() {
            Ok(meta) => meta,
            // Deleted after being listed: it no longer occupies any space.
            Err(source) if source.kind() == std::io::ErrorKind::NotFound => continue,
            Err(source) => return Err(GcError::Io { path, source }),
        };
        total += if meta.is_dir() {
            dir_size(&path)?
        } else {
            meta.len()
        };
    }
    Ok(total)
}
// Tests for blob garbage collection, cache statistics, component clearing and
// component token parsing. All filesystem tests run against a throwaway
// `.basemind` directory built by `build_fixture`.
#[cfg(test)]
mod tests {
use super::*;
use crate::store::{FileEntry, INDEX_FILE, Index};
use std::fs;
use std::path::PathBuf;
// On-disk fixture shared by the filesystem tests.
struct Fixture {
// Held only so the temporary directory lives as long as the fixture
// (`TempDir` removes the directory when dropped).
_tmp: tempfile::TempDir,
// Path of the `.basemind` directory inside the temporary directory.
basemind_dir: PathBuf,
// Stem of the blobs that the `working` view's index references.
referenced_stem: String,
// Stem of the blob that no index references.
orphan_stem: String,
// Byte length of the orphan blob's contents.
orphan_len: u64,
}
// Builds a `.basemind` directory containing:
// - two blobs (`.l1.msgpack`, `.l2.msgpack`) for the referenced stem,
// - one `.l1.msgpack` blob for the orphan stem,
// - a `working` view whose index has a single file entry pointing at the
//   referenced stem.
fn build_fixture() -> Fixture {
let tmp = tempfile::tempdir().expect("tempdir");
let basemind_dir = tmp.path().join(".basemind");
let blobs = basemind_dir.join(BLOBS_DIR);
let working = basemind_dir.join(VIEWS_DIR).join("working");
fs::create_dir_all(&blobs).expect("mk blobs");
fs::create_dir_all(&working).expect("mk view");
// 64-character stems of a single repeated letter.
let referenced_stem = "a".repeat(64);
let orphan_stem = "b".repeat(64);
fs::write(blobs.join(format!("{referenced_stem}.l1.msgpack")), b"l1")
.expect("write ref l1");
fs::write(blobs.join(format!("{referenced_stem}.l2.msgpack")), b"l2-")
.expect("write ref l2");
let orphan_bytes = b"orphan-blob-bytes";
let orphan_len = orphan_bytes.len() as u64;
fs::write(
blobs.join(format!("{orphan_stem}.l1.msgpack")),
orphan_bytes,
)
.expect("write orphan");
// The index references only the `a…` stem, which leaves the `b…` blob orphaned.
let mut index = Index::empty();
index.files.insert(
crate::path::RelPath::from("src/main.rs"),
FileEntry {
hash_hex: referenced_stem.clone(),
language: "rust".to_string(),
size_bytes: 2,
mtime: 0,
},
);
let bytes = rmp_serde::to_vec_named(&index).expect("encode index");
fs::write(working.join(INDEX_FILE), bytes).expect("write index");
Fixture {
_tmp: tmp,
basemind_dir,
referenced_stem,
orphan_stem,
orphan_len,
}
}
// Only the stem recorded in the view index is collected.
#[test]
fn should_collect_only_referenced_stem() {
let fx = build_fixture();
let referenced = collect_referenced_hashes(&fx.basemind_dir).expect("collect");
assert_eq!(referenced.len(), 1, "exactly one live stem");
assert!(
referenced.contains(&fx.referenced_stem),
"live stem present"
);
assert!(
!referenced.contains(&fx.orphan_stem),
"orphan stem must not be referenced"
);
}
// GC inspects all three blob files, deletes only the orphan, and reports its
// exact size as freed.
#[test]
fn should_remove_only_orphan_blob() {
let fx = build_fixture();
let referenced = collect_referenced_hashes(&fx.basemind_dir).expect("collect");
let report = gc_blobs(&fx.basemind_dir, &referenced).expect("gc");
assert_eq!(report.scanned, 3, "two ref blobs + one orphan inspected");
assert_eq!(report.removed, 1, "only the orphan removed");
assert_eq!(
report.bytes_freed, fx.orphan_len,
"freed bytes equal the orphan's exact length"
);
let blobs = fx.basemind_dir.join(BLOBS_DIR);
assert!(
blobs
.join(format!("{}.l1.msgpack", fx.referenced_stem))
.exists(),
"referenced l1 survives"
);
assert!(
blobs
.join(format!("{}.l2.msgpack", fx.referenced_stem))
.exists(),
"referenced l2 survives"
);
assert!(
!blobs
.join(format!("{}.l1.msgpack", fx.orphan_stem))
.exists(),
"orphan l1 gone"
);
}
// Stats report one orphan before `run_gc` and none afterwards.
#[test]
fn should_report_one_orphan_before_gc_and_zero_after() {
let fx = build_fixture();
let before = cache_stats(&fx.basemind_dir).expect("stats before");
assert_eq!(before.blob_count, 3, "three blob files on disk");
assert_eq!(before.orphan_blob_count, 1, "one orphan before GC");
assert_eq!(
before.per_view_file_count,
vec![("working".to_string(), 1)],
"single working view with one indexed file"
);
run_gc(&fx.basemind_dir).expect("gc");
let after = cache_stats(&fx.basemind_dir).expect("stats after");
assert_eq!(after.blob_count, 2, "orphan reaped");
assert_eq!(after.orphan_blob_count, 0, "no orphans remain");
}
// Clearing `Blobs` empties the blobs directory but keeps the directory itself,
// the view index and the telemetry file.
#[test]
fn should_clear_only_blobs_component() {
let fx = build_fixture();
fs::write(fx.basemind_dir.join(TELEMETRY_FILENAME), b"{}\n").expect("telemetry");
clear_component(&fx.basemind_dir, CacheComponent::Blobs).expect("clear blobs");
let blobs = fx.basemind_dir.join(BLOBS_DIR);
let remaining: Vec<_> = fs::read_dir(&blobs)
.expect("read blobs")
.filter_map(Result::ok)
.collect();
assert!(remaining.is_empty(), "blobs dir emptied: {remaining:?}");
assert!(blobs.exists(), "blobs dir itself preserved");
assert!(
fx.basemind_dir
.join(VIEWS_DIR)
.join("working")
.join(INDEX_FILE)
.exists(),
"view index untouched by Blobs clear"
);
assert!(
fx.basemind_dir.join(TELEMETRY_FILENAME).exists(),
"telemetry untouched by Blobs clear"
);
}
// Every variant's `as_str` token parses back to the same variant, and an
// unknown token is rejected.
#[test]
fn should_round_trip_component_tokens() {
for component in [
CacheComponent::Blobs,
CacheComponent::Views,
CacheComponent::Lance,
CacheComponent::GitCache,
CacheComponent::Telemetry,
CacheComponent::All,
] {
let token = component.as_str();
let parsed: CacheComponent = token.parse().expect("parse token");
assert_eq!(parsed, component, "round-trip {token}");
}
assert!(
"nonsense".parse::<CacheComponent>().is_err(),
"unknown token rejected"
);
}
}