use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
/// One index entry as stored in the on-disk registry file.
///
/// Every field except `id` and `root_path` carries a serde default, so a
/// registry written before a field existed still deserialises. Fields that
/// hold their default value are skipped on serialisation.
///
/// NOTE(review): the meaning of the list-valued filter fields below is
/// inferred from their names only; this file never interprets them — confirm
/// against the indexing code.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PersistedIndex {
/// Identifier of the index. The upsert/remove helpers in this file match
/// entries on this value, and it is sanitised to name the per-index data
/// directory.
pub id: String,
/// Root path of the indexed tree. When `colocated` is set, the
/// `*_for_entry` helpers in this file derive storage paths from it.
pub root_path: PathBuf,
/// Defaults to empty when missing; omitted from the file when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub include_paths: Vec<String>,
/// Defaults to empty when missing; omitted from the file when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub exclude_globs: Vec<String>,
/// Defaults to empty when missing; omitted from the file when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub extensions: Vec<String>,
/// Defaults to empty when missing; omitted from the file when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub domain_terms: Vec<String>,
/// Defaults to empty when missing; omitted from the file when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub path_filter: Vec<String>,
/// Defaults to `true` when missing; only written to the file when `false`.
#[serde(default = "default_include_docs", skip_serializing_if = "is_true")]
pub include_docs: bool,
/// Defaults to `true` when missing; only written to the file when `false`.
#[serde(
default = "default_respect_gitignore",
skip_serializing_if = "is_default_respect_gitignore"
)]
pub respect_gitignore: bool,
/// Defaults to `false` when missing; only written to the file when `true`
/// (`Not::not` returns `true` for `false`, which triggers the skip).
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub lexical_only: bool,
/// Defaults to `false` when missing; only written to the file when `true`.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub skip_kg: bool,
/// Defaults to `false` when missing; only written to the file when `true`.
/// When set, the `*_for_entry` helpers resolve storage paths through
/// `crate::service::colocated_storage` using `root_path` instead of the
/// per-id data directory.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub colocated: bool,
}
/// Serde default for `PersistedIndex::respect_gitignore`, used when the field
/// is absent from the registry file.
fn default_respect_gitignore() -> bool {
    // Entries that do not mention the flag are treated as having it enabled.
    true
}
/// Serde default for `PersistedIndex::include_docs`, used when the field is
/// absent from the registry file.
fn default_include_docs() -> bool {
    // Entries that do not mention the flag are treated as having it enabled.
    true
}
/// `skip_serializing_if` predicate: returns the flag itself, so a field whose
/// value is `true` is left out of the serialised output.
///
/// Takes `&bool` because serde passes the field by reference.
fn is_true(&v: &bool) -> bool {
    v
}
/// `skip_serializing_if` predicate for `respect_gitignore`: reports whether
/// the flag is `true`, in which case the field is omitted when serialising.
///
/// Takes `&bool` because serde passes the field by reference.
fn is_default_respect_gitignore(&v: &bool) -> bool {
    v
}
impl Default for PersistedIndex {
fn default() -> Self {
Self {
id: String::new(),
root_path: PathBuf::new(),
include_paths: Vec::new(),
exclude_globs: Vec::new(),
extensions: Vec::new(),
domain_terms: Vec::new(),
path_filter: Vec::new(),
include_docs: true,
respect_gitignore: true,
lexical_only: false,
skip_kg: false,
colocated: false,
}
}
}
/// Top-level shape of the registry file: a list of [`PersistedIndex`] entries
/// stored under the key `index`.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct IndexRegistryFile {
/// Registered indexes. Serialised under the name `index`; defaults to an
/// empty list when the key is missing from the file.
#[serde(default, rename = "index")]
pub indexes: Vec<PersistedIndex>,
}
/// Resolves the root directory for persisted data, creating it if needed.
///
/// Resolution order:
/// 1. The `TRUSTY_DATA_DIR` environment variable, when set to a non-empty
///    value. The value is used as-is, without appending a sub-directory.
/// 2. `dirs::data_local_dir()` joined with `trusty-search`.
///
/// # Errors
///
/// Fails when the platform data-local directory cannot be determined (and no
/// override is set), or when the chosen directory cannot be created.
pub fn data_dir() -> Result<PathBuf> {
    // `var_os` instead of `var`: a path need not be valid UTF-8, and `var`
    // would silently discard such an override. An empty value is treated as
    // "unset"; otherwise the result would be an empty path, and every file
    // derived from it would land relative to the current working directory.
    let override_dir = std::env::var_os("TRUSTY_DATA_DIR").filter(|raw| !raw.is_empty());

    let (dir, create_ctx) = match override_dir {
        Some(raw) => (PathBuf::from(raw), "create TRUSTY_DATA_DIR data dir"),
        None => (
            dirs::data_local_dir()
                .context("could not determine data-local directory")?
                .join("trusty-search"),
            "create trusty-search data dir",
        ),
    };

    std::fs::create_dir_all(&dir).context(create_ctx)?;
    Ok(dir)
}
/// Location of the registry file, `indexes.toml`, directly inside the data
/// directory.
///
/// # Errors
///
/// Propagates any failure from [`data_dir`].
pub fn indexes_toml_path() -> Result<PathBuf> {
    data_dir().map(|root| root.join("indexes.toml"))
}
/// Returns `<data_dir>/indexes/<sanitised id>`, creating the directory (and
/// any missing parents) as a side effect.
///
/// # Errors
///
/// Fails when the data directory cannot be resolved or the per-index
/// directory cannot be created.
pub fn index_data_dir(index_id: &str) -> Result<PathBuf> {
    let mut dir = data_dir()?;
    dir.push("indexes");
    // The id is sanitised so it forms a single path component.
    dir.push(sanitize_id(index_id));
    std::fs::create_dir_all(&dir).context("create per-index data dir")?;
    Ok(dir)
}
/// Crate-visible entry point to the id sanitiser used for on-disk directory
/// names; forwards directly to `sanitize_id`.
pub(crate) fn sanitize_id_for_path(id: &str) -> String {
    sanitize_id(id)
}
/// Maps an index id to a string that is safe to use as a single directory
/// name.
///
/// ASCII letters, ASCII digits, `.`, `_` and `-` are kept; every other
/// character (including path separators) becomes `_`.
///
/// Ids that would sanitise to the empty string, `.` or `..` are rewritten to
/// `_`, `_` and `__` respectively. Without this, joining the result under
/// `indexes/` would resolve to the `indexes` directory itself or to its
/// parent, and `remove_index_data_dir` would then recursively delete that
/// directory.
fn sanitize_id(id: &str) -> String {
    let cleaned: String = id
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-') {
                c
            } else {
                '_'
            }
        })
        .collect();

    // Neutralise the components that have special meaning in a path.
    match cleaned.as_str() {
        "" | "." => "_".to_owned(),
        ".." => "__".to_owned(),
        _ => cleaned,
    }
}
/// Path of `hnsw.usearch` inside the per-index data directory.
///
/// Resolving the path creates the per-index directory via
/// [`index_data_dir`]; its errors are propagated.
pub fn hnsw_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("hnsw.usearch"))
}
/// Path of `chunks.json` inside the per-index data directory.
///
/// Resolving the path creates the per-index directory via
/// [`index_data_dir`]; its errors are propagated.
pub fn chunks_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("chunks.json"))
}
/// Path of `index.redb` inside the per-index data directory.
///
/// Resolving the path creates the per-index directory via
/// [`index_data_dir`]; its errors are propagated.
pub fn corpus_redb_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("index.redb"))
}
/// Path of `schema_version.json` inside the per-index data directory.
///
/// Resolving the path creates the per-index directory via
/// [`index_data_dir`]; its errors are propagated.
pub fn schema_version_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("schema_version.json"))
}
/// Path of `index.redb.tmp` inside the per-index data directory.
///
/// Resolving the path creates the per-index directory via
/// [`index_data_dir`]; its errors are propagated.
pub fn corpus_redb_tmp_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("index.redb.tmp"))
}
/// Resolves the HNSW file path for a registry entry.
///
/// Colocated entries are resolved from `root_path` by the colocated-storage
/// module; all others use the per-id data directory via [`hnsw_path`].
pub fn hnsw_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return hnsw_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_hnsw_path(&entry.root_path)
}
/// Resolves the corpus redb file path for a registry entry.
///
/// Colocated entries are resolved from `root_path` by the colocated-storage
/// module; all others use the per-id data directory via
/// [`corpus_redb_path`].
pub fn corpus_redb_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return corpus_redb_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_redb_path(&entry.root_path)
}
/// Resolves the schema-version file path for a registry entry.
///
/// Colocated entries are resolved from `root_path` by the colocated-storage
/// module; all others use the per-id data directory via
/// [`schema_version_path`].
pub fn schema_version_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return schema_version_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_schema_version_path(&entry.root_path)
}
/// Resolves the temporary corpus redb file path for a registry entry.
///
/// Colocated entries are resolved from `root_path` by the colocated-storage
/// module; all others use the per-id data directory via
/// [`corpus_redb_tmp_path`].
pub fn corpus_redb_tmp_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return corpus_redb_tmp_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_redb_tmp_path(&entry.root_path)
}
/// Loads the registry from its default location (see [`indexes_toml_path`]).
///
/// # Errors
///
/// Fails when the default path cannot be resolved or the file exists but
/// cannot be read.
pub fn load_index_registry() -> Result<Vec<PersistedIndex>> {
    let registry_path = indexes_toml_path()?;
    load_index_registry_at(&registry_path)
}
/// Loads the registry entries stored at `path`.
///
/// * A missing file yields an empty list.
/// * A file that cannot be parsed as TOML also yields an empty list, after
///   logging a warning; the parse error is not returned to the caller.
///
/// # Errors
///
/// Only I/O failures other than "not found" while reading the file.
pub fn load_index_registry_at(path: &Path) -> Result<Vec<PersistedIndex>> {
    let content = match std::fs::read_to_string(path) {
        Ok(text) => text,
        Err(err) => {
            // A registry that was never written reads as an empty one.
            if err.kind() == std::io::ErrorKind::NotFound {
                return Ok(Vec::new());
            }
            return Err(err).context("read indexes.toml");
        }
    };

    let indexes = toml::from_str::<IndexRegistryFile>(&content)
        .map(|file| file.indexes)
        .unwrap_or_else(|e| {
            // Best effort: an unparseable file is reported, not fatal.
            tracing::warn!(
                "indexes.toml at {} is corrupt ({e}); starting with empty registry",
                path.display()
            );
            Vec::new()
        });
    Ok(indexes)
}
/// Writes `entries` to the registry at its default location
/// (see [`indexes_toml_path`]), replacing the previous contents.
pub fn save_index_registry(entries: &[PersistedIndex]) -> Result<()> {
    let registry_path = indexes_toml_path()?;
    save_index_registry_at(&registry_path, entries)
}
/// Serialises `entries` as TOML and stores them at `path`.
///
/// The content is first written to a sibling file (the path with its
/// extension replaced by `toml.tmp`) and then renamed over `path`, so the
/// target is replaced in one step rather than rewritten in place.
///
/// # Errors
///
/// Fails when serialisation, the temporary write, or the rename fails.
pub fn save_index_registry_at(path: &Path, entries: &[PersistedIndex]) -> Result<()> {
    let snapshot = IndexRegistryFile {
        indexes: entries.to_vec(),
    };
    let body = toml::to_string_pretty(&snapshot).context("serialize indexes.toml")?;

    let staging = path.with_extension("toml.tmp");
    std::fs::write(&staging, body).context("write indexes.toml tmp")?;
    std::fs::rename(&staging, path).context("rename indexes.toml")
}
/// Inserts or replaces `entry` in the registry at its default location
/// (see [`indexes_toml_path`]).
pub fn upsert_index_registry_entry(entry: PersistedIndex) -> Result<()> {
    let registry_path = indexes_toml_path()?;
    upsert_index_registry_entry_at(&registry_path, entry)
}
/// Inserts `entry` into the registry at `path`, or replaces the first
/// existing entry with the same `id`, then saves the registry.
///
/// # Errors
///
/// Propagates failures from loading or saving the registry.
pub fn upsert_index_registry_entry_at(path: &Path, entry: PersistedIndex) -> Result<()> {
    let mut registry = load_index_registry_at(path)?;
    match registry.iter().position(|known| known.id == entry.id) {
        // Same id already registered: overwrite in place, keeping its position.
        Some(slot) => registry[slot] = entry,
        None => registry.push(entry),
    }
    save_index_registry_at(path, &registry)
}
/// Removes the entry with the given `id` from the registry at its default
/// location (see [`indexes_toml_path`]).
pub fn remove_index_registry_entry(id: &str) -> Result<()> {
    let registry_path = indexes_toml_path()?;
    remove_index_registry_entry_at(&registry_path, id)
}
/// Removes every entry whose id equals `id` from the registry at `path`.
///
/// When no entry matches, the file is left untouched (no rewrite happens).
///
/// # Errors
///
/// Propagates failures from loading or saving the registry.
pub fn remove_index_registry_entry_at(path: &Path, id: &str) -> Result<()> {
    let mut registry = load_index_registry_at(path)?;
    // Nothing to remove: skip the save so the file is not rewritten.
    if !registry.iter().any(|known| known.id == id) {
        return Ok(());
    }
    registry.retain(|known| known.id != id);
    save_index_registry_at(path, &registry)
}
/// Recursively deletes `<data_dir>/indexes/<sanitised id>` if it exists.
///
/// A directory that does not exist is not an error.
///
/// # Errors
///
/// Fails when the data directory cannot be resolved or the removal fails;
/// the removal error is annotated with the directory path.
pub fn remove_index_data_dir(index_id: &str) -> Result<()> {
    let target = data_dir()?.join("indexes").join(sanitize_id(index_id));
    if !target.exists() {
        return Ok(());
    }
    std::fs::remove_dir_all(&target).with_context(|| format!("remove {}", target.display()))
}
/// Returns `true` when `path` refers to an existing regular file.
///
/// `Path::is_file` already returns `false` for paths that do not exist or
/// cannot be inspected, so no separate existence check is needed. Symbolic
/// links are followed.
pub fn has_persisted_hnsw(path: &Path) -> bool {
    path.is_file()
}
// Unit tests for the registry persistence helpers defined in the parent
// module. Tests that touch the filesystem use `tempfile` paths and the
// `*_at` variants, so they do not read or write the real registry location.
#[cfg(test)]
mod tests {
use super::*;
// Allowed characters pass through; anything else (spaces, slashes) becomes
// `_`.
#[test]
fn sanitize_strips_unsafe_chars() {
assert_eq!(sanitize_id("good-name_1.0"), "good-name_1.0");
assert_eq!(sanitize_id("../escape"), ".._escape");
assert_eq!(sanitize_id("with spaces/slash"), "with_spaces_slash");
}
// Serialising a registry with two entries to TOML and parsing it back
// yields equal entries.
#[test]
fn registry_file_serde_roundtrip() {
let file = IndexRegistryFile {
indexes: vec![
PersistedIndex {
id: "a".into(),
root_path: PathBuf::from("/tmp/a"),
..Default::default()
},
PersistedIndex {
id: "b".into(),
root_path: PathBuf::from("/tmp/b"),
..Default::default()
},
],
};
let s = toml::to_string_pretty(&file).unwrap();
let parsed: IndexRegistryFile = toml::from_str(&s).unwrap();
assert_eq!(parsed.indexes, file.indexes);
}
// Removing an entry is visible on the next load, and removing an id that is
// no longer present succeeds without changing what loads.
#[test]
fn remove_index_persists_to_toml() {
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
upsert_index_registry_entry_at(
&path,
PersistedIndex {
id: "keep".into(),
root_path: PathBuf::from("/tmp/keep"),
..Default::default()
},
)
.unwrap();
upsert_index_registry_entry_at(
&path,
PersistedIndex {
id: "drop".into(),
root_path: PathBuf::from("/tmp/drop"),
..Default::default()
},
)
.unwrap();
assert_eq!(load_index_registry_at(&path).unwrap().len(), 2);
remove_index_registry_entry_at(&path, "drop").unwrap();
let restored = load_index_registry_at(&path).unwrap();
assert_eq!(restored.len(), 1);
assert_eq!(restored[0].id, "keep");
assert!(restored.iter().all(|e| e.id != "drop"));
// Second removal of the same id: nothing matches, one entry still loads.
remove_index_registry_entry_at(&path, "drop").unwrap();
assert_eq!(load_index_registry_at(&path).unwrap().len(), 1);
}
// Upserting twice with the same id leaves one entry holding the newer
// `root_path`.
#[test]
fn upsert_index_dedupes_on_id() {
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
upsert_index_registry_entry_at(
&path,
PersistedIndex {
id: "proj".into(),
root_path: PathBuf::from("/old"),
..Default::default()
},
)
.unwrap();
upsert_index_registry_entry_at(
&path,
PersistedIndex {
id: "proj".into(),
root_path: PathBuf::from("/new"),
..Default::default()
},
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1, "duplicate [[index]] block written");
assert_eq!(entries[0].root_path, PathBuf::from("/new"));
}
// `respect_gitignore`: defaults to true in `Default`, loads as true when the
// field is missing from the file, and an explicit false survives save/load.
#[test]
fn respect_gitignore_defaults_true_and_round_trips() {
assert!(PersistedIndex::default().respect_gitignore);
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
// A file containing only the two required fields.
std::fs::write(
&path,
r#"
[[index]]
id = "legacy"
root_path = "/tmp/legacy"
"#,
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(
entries[0].respect_gitignore,
"missing field must default to true (issue #100 back-compat)"
);
// Fresh file for the explicit-false round trip.
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
save_index_registry_at(
&path,
&[PersistedIndex {
id: "vendored".into(),
root_path: PathBuf::from("/tmp/v"),
respect_gitignore: false,
..Default::default()
}],
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(!entries[0].respect_gitignore);
}
// `include_docs`: defaults to true in `Default`, loads as true when the field
// is missing from the file, and an explicit false survives save/load.
#[test]
fn include_docs_defaults_true_and_round_trips() {
assert!(PersistedIndex::default().include_docs);
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
// A file containing only the two required fields.
std::fs::write(
&path,
r#"
[[index]]
id = "legacy"
root_path = "/tmp/legacy"
"#,
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(
entries[0].include_docs,
"missing field must default to true (issue #118 migration)"
);
// Fresh file for the explicit-false round trip.
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
save_index_registry_at(
&path,
&[PersistedIndex {
id: "docs_off".into(),
root_path: PathBuf::from("/tmp/v"),
include_docs: false,
..Default::default()
}],
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(!entries[0].include_docs);
}
// `lexical_only`: defaults to false, loads as false when missing, and an
// explicit true is both written to the file and read back.
#[test]
fn lexical_only_round_trips() {
assert!(!PersistedIndex::default().lexical_only);
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
// A file containing only the two required fields.
std::fs::write(
&path,
r#"
[[index]]
id = "legacy"
root_path = "/tmp/legacy"
"#,
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(
!entries[0].lexical_only,
"missing field must default to false (issue #109 back-compat)"
);
// Fresh file for the explicit-true round trip.
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
save_index_registry_at(
&path,
&[PersistedIndex {
id: "lex_only".into(),
root_path: PathBuf::from("/tmp/v"),
lexical_only: true,
..Default::default()
}],
)
.unwrap();
// Inspect the raw TOML: a true flag must not be skipped on serialisation.
let s = std::fs::read_to_string(&path).unwrap();
assert!(
s.contains("lexical_only"),
"explicit true must be serialised — TOML was: {s}"
);
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(entries[0].lexical_only);
}
// `skip_kg`: defaults to false, loads as false when missing, an explicit true
// is written and read back, and it round-trips together with `lexical_only`.
#[test]
fn skip_kg_round_trips() {
assert!(!PersistedIndex::default().skip_kg);
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
// A file containing only the two required fields.
std::fs::write(
&path,
r#"
[[index]]
id = "legacy"
root_path = "/tmp/legacy"
"#,
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(
!entries[0].skip_kg,
"missing field must default to false (issue #313 back-compat)"
);
// Fresh file for the explicit-true round trip.
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
save_index_registry_at(
&path,
&[PersistedIndex {
id: "no_kg".into(),
root_path: PathBuf::from("/tmp/v"),
skip_kg: true,
..Default::default()
}],
)
.unwrap();
// Inspect the raw TOML: a true flag must not be skipped on serialisation.
let s = std::fs::read_to_string(&path).unwrap();
assert!(
s.contains("skip_kg"),
"explicit true must be serialised — TOML was: {s}"
);
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(entries[0].skip_kg);
// Both flags set on the same entry must survive independently.
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
save_index_registry_at(
&path,
&[PersistedIndex {
id: "both_flags".into(),
root_path: PathBuf::from("/tmp/v"),
lexical_only: true,
skip_kg: true,
..Default::default()
}],
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(entries[0].lexical_only, "lexical_only preserved");
assert!(entries[0].skip_kg, "skip_kg preserved");
}
// `colocated`: defaults to false, loads as false when missing, an explicit
// true round-trips, and the `*_for_entry` helpers then return paths located
// under the entry's `root_path`.
#[test]
fn colocated_flag_round_trips() {
assert!(!PersistedIndex::default().colocated);
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
// A file containing only the two required fields.
std::fs::write(
&path,
r#"
[[index]]
id = "legacy"
root_path = "/tmp/legacy_col"
"#,
)
.unwrap();
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(
!entries[0].colocated,
"missing field must default to false (issue #403 back-compat)"
);
// Fresh registry file plus a real temporary directory used as the index
// root, so the colocated path helpers have an existing root to work with.
let tmp = tempfile::NamedTempFile::new().unwrap();
let path = tmp.path().to_path_buf();
let root_dir = tempfile::tempdir().unwrap();
save_index_registry_at(
&path,
&[PersistedIndex {
id: "colocated_idx".into(),
root_path: root_dir.path().to_path_buf(),
colocated: true,
..Default::default()
}],
)
.unwrap();
// Inspect the raw TOML: a true flag must not be skipped on serialisation.
let s = std::fs::read_to_string(&path).unwrap();
assert!(
s.contains("colocated"),
"explicit true must be serialised — TOML was: {s}"
);
let entries = load_index_registry_at(&path).unwrap();
assert_eq!(entries.len(), 1);
assert!(entries[0].colocated);
let hnsw = super::hnsw_path_for_entry(&entries[0]).unwrap();
assert!(
hnsw.starts_with(root_dir.path()),
"colocated hnsw path must be inside root; got {hnsw:?}"
);
let redb = super::corpus_redb_path_for_entry(&entries[0]).unwrap();
assert!(
redb.starts_with(root_dir.path()),
"colocated redb path must be inside root; got {redb:?}"
);
}
// In-memory check of the find-or-push upsert pattern on a plain `Vec`.
// NOTE(review): this test runs an inline copy of the upsert logic rather than
// calling `upsert_index_registry_entry_at`, and the copy updates only
// `root_path` whereas the production function replaces the whole entry —
// confirm this divergence is intended.
#[test]
fn registry_upsert_idempotent_unit() {
let mut entries = vec![PersistedIndex {
id: "a".into(),
root_path: PathBuf::from("/old"),
..Default::default()
}];
let new = PersistedIndex {
id: "a".into(),
root_path: PathBuf::from("/new"),
..Default::default()
};
if let Some(existing) = entries.iter_mut().find(|e| e.id == new.id) {
existing.root_path = new.root_path.clone();
} else {
entries.push(new);
}
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].root_path, PathBuf::from("/new"));
}
// With `TRUSTY_DATA_DIR` set, `data_dir()` returns exactly that path and the
// directory exists afterwards.
#[test]
fn data_dir_respects_trusty_data_dir_env_var() {
let tmp = tempfile::tempdir().unwrap();
let override_path = tmp.path().to_path_buf();
let unique = override_path.join("persistence_data_dir_test");
std::fs::create_dir_all(&unique).unwrap();
// NOTE(review): this mutates the process-wide environment. It assumes no
// other thread reads or writes environment variables while the variable is
// set — confirm, since tests in one binary may run on several threads.
unsafe {
std::env::set_var("TRUSTY_DATA_DIR", &unique);
}
// Capture the result before asserting so the variable is always removed,
// even when the call returned an error.
let result = data_dir();
unsafe {
std::env::remove_var("TRUSTY_DATA_DIR");
}
let dir = result.expect("data_dir with TRUSTY_DATA_DIR must succeed");
assert_eq!(dir, unique, "data_dir() should return the override path");
assert!(
dir.exists(),
"data_dir() should ensure the directory exists"
);
}
}