use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
/// One index entry as stored in the on-disk registry (`indexes.toml`).
///
/// The serde attributes keep the file compact: most fields are left out of the
/// serialized form while they hold their default value, and are restored from
/// the matching default when the key is absent on load.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PersistedIndex {
/// Identifier of the index. Registry upserts/removals match entries on this
/// value, and non-colocated data lives in a directory derived from it.
pub id: String,
/// Root path of the index. Colocated entries resolve their storage paths
/// from this value (see the `*_for_entry` helpers).
pub root_path: PathBuf,
/// Omitted from the file when empty; an absent key loads as an empty list.
/// NOTE(review): presumably restricts indexing to these paths — confirm against the walker.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub include_paths: Vec<String>,
/// Omitted from the file when empty; an absent key loads as an empty list.
/// NOTE(review): presumably glob patterns excluded from indexing — confirm against the walker.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub exclude_globs: Vec<String>,
/// Omitted from the file when empty; an absent key loads as an empty list.
/// NOTE(review): presumably a file-extension allow-list — confirm against the walker.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub extensions: Vec<String>,
/// Omitted from the file when empty; an absent key loads as an empty list.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub domain_terms: Vec<String>,
/// Omitted from the file when empty; an absent key loads as an empty list.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub path_filter: Vec<String>,
/// Defaults to `true` when the key is absent, and is only written when `false`.
#[serde(default = "default_include_docs", skip_serializing_if = "is_true")]
pub include_docs: bool,
/// Defaults to `true` when the key is absent, and is only written when `false`.
#[serde(
default = "default_respect_gitignore",
skip_serializing_if = "is_default_respect_gitignore"
)]
pub respect_gitignore: bool,
/// Falls back to the walker module's default list when the key is absent.
/// There is no skip rule, so the list is always written out.
#[serde(default = "default_extra_skip_dirs")]
pub extra_skip_dirs: Vec<String>,
/// An absent key loads as `Some(DEFAULT_DATA_FILE_MAX_BYTES)` (the walker's
/// constant). Use `resolve_data_file_max_bytes` to obtain a concrete limit.
#[serde(default = "default_data_file_max_bytes")]
pub data_file_max_bytes: Option<u64>,
/// Defaults to `false`; only written when `true`.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub lexical_only: bool,
/// Defaults to `false`; only written when `true`.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub skip_kg: bool,
/// Defaults to `true` when the key is absent, and is only written when `false`.
#[serde(default = "default_defer_embed", skip_serializing_if = "is_true")]
pub defer_embed: bool,
/// When `true`, the `*_for_entry` helpers resolve storage paths through
/// `colocated_storage` using `root_path` instead of the per-id data directory.
/// Defaults to `false`; only written when `true`.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub colocated: bool,
/// Omitted from the file when `None`.
/// NOTE(review): presumably a Unix timestamp of the last query — confirm in `persistence_timestamps`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_queried_unix: Option<u64>,
/// Omitted from the file when `None`.
/// NOTE(review): presumably a Unix timestamp of the last indexing run — confirm in `persistence_timestamps`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_indexed_unix: Option<u64>,
}
/// Serde default for `PersistedIndex::respect_gitignore`: `true` when the key is absent.
fn default_respect_gitignore() -> bool {
true
}
/// Serde default for `PersistedIndex::defer_embed`: `true` when the key is absent.
fn default_defer_embed() -> bool {
true
}
/// Serde default for `PersistedIndex::extra_skip_dirs`; also used by the
/// `Default` impl. Delegates to the walker module so both share one list.
fn default_extra_skip_dirs() -> Vec<String> {
crate::service::walker::default_extra_skip_dirs()
}
/// Serde default for `PersistedIndex::data_file_max_bytes`; also used by the
/// `Default` impl. Wraps the walker's `DEFAULT_DATA_FILE_MAX_BYTES` in `Some`.
fn default_data_file_max_bytes() -> Option<u64> {
Some(crate::service::walker::DEFAULT_DATA_FILE_MAX_BYTES)
}
/// Turns a stored `data_file_max_bytes` setting into a concrete limit.
///
/// Returns the stored value when one is present; otherwise falls back to the
/// walker's `DEFAULT_DATA_FILE_MAX_BYTES`.
pub fn resolve_data_file_max_bytes(stored: Option<u64>) -> u64 {
    match stored {
        Some(limit) => limit,
        None => crate::service::walker::DEFAULT_DATA_FILE_MAX_BYTES,
    }
}
/// Serde default for `PersistedIndex::include_docs`: `true` when the key is absent.
fn default_include_docs() -> bool {
true
}
/// `skip_serializing_if` predicate for boolean fields whose default is `true`:
/// reports whether the flag is set, so a default-valued field is left out of
/// the serialized output.
fn is_true(&v: &bool) -> bool {
    v
}
/// `skip_serializing_if` predicate for `PersistedIndex::respect_gitignore`:
/// returns `true` when the flag is `true`, so that value is left out of the
/// serialized output.
fn is_default_respect_gitignore(&v: &bool) -> bool {
    v
}
impl Default for PersistedIndex {
    /// Builds an entry with an empty id and root path, empty filter lists, and
    /// the same flag/limit defaults that serde applies to keys missing from
    /// the registry file.
    fn default() -> Self {
        // Walker-derived defaults are resolved up front so the literal below
        // reads as plain data.
        let extra_skip_dirs = default_extra_skip_dirs();
        let data_file_max_bytes = default_data_file_max_bytes();

        Self {
            // Identity.
            id: String::new(),
            root_path: PathBuf::new(),
            // Filters: all empty.
            include_paths: vec![],
            exclude_globs: vec![],
            extensions: vec![],
            domain_terms: vec![],
            path_filter: vec![],
            // Walker behavior.
            include_docs: true,
            respect_gitignore: true,
            extra_skip_dirs,
            data_file_max_bytes,
            // Pipeline toggles.
            lexical_only: false,
            skip_kg: false,
            defer_embed: true,
            colocated: false,
            // Timestamps start unset.
            last_queried_unix: None,
            last_indexed_unix: None,
        }
    }
}
pub use super::persistence_timestamps::{
read_last_queried_unix, update_last_indexed_unix, update_last_queried_unix, warmboot_sort_key,
};
/// Top-level shape of the registry file read and written by
/// `load_index_registry_at` / `save_index_registry_at`.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct IndexRegistryFile {
/// All registered indexes. Stored under the key `index` in the file; an
/// absent key loads as an empty list.
#[serde(default, rename = "index")]
pub indexes: Vec<PersistedIndex>,
}
/// Resolves (and creates) the directory that holds all persisted state.
///
/// Resolution order:
/// 1. `TRUSTY_DATA_DIR`, when set — it must be an absolute path.
/// 2. `dirs::data_local_dir()` joined with `trusty-search`.
/// 3. The home-directory fallback in `super::data_dir`.
///
/// # Errors
/// Fails when the override is not absolute, or when the chosen directory
/// cannot be created.
pub fn data_dir() -> Result<PathBuf> {
    match std::env::var("TRUSTY_DATA_DIR") {
        Ok(raw_override) => {
            let chosen = PathBuf::from(&raw_override);
            anyhow::ensure!(
                chosen.is_absolute(),
                "TRUSTY_DATA_DIR must be an absolute path (got: {})",
                raw_override
            );
            std::fs::create_dir_all(&chosen).context("create TRUSTY_DATA_DIR data dir")?;
            tracing::debug!("data_dir: TRUSTY_DATA_DIR override: {}", chosen.display());
            Ok(chosen)
        }
        // Unset (or unreadable) override: fall through to platform defaults.
        Err(_) => match dirs::data_local_dir() {
            Some(platform_base) => {
                let chosen = platform_base.join("trusty-search");
                std::fs::create_dir_all(&chosen).context("create trusty-search data dir")?;
                tracing::debug!("data_dir: dirs::data_local_dir: {}", chosen.display());
                Ok(chosen)
            }
            None => super::data_dir::data_dir_home_fallback(),
        },
    }
}
/// Location of the registry file: `indexes.toml` directly inside [`data_dir`].
///
/// # Errors
/// Propagates any failure from [`data_dir`].
pub fn indexes_toml_path() -> Result<PathBuf> {
    data_dir().map(|base| base.join("indexes.toml"))
}
/// Returns `<data_dir>/indexes/<sanitized id>`, creating the directory (and
/// any missing parents) as a side effect.
///
/// # Errors
/// Fails when the data directory cannot be resolved or the per-index
/// directory cannot be created.
pub fn index_data_dir(index_id: &str) -> Result<PathBuf> {
    let mut per_index = data_dir()?;
    per_index.push("indexes");
    per_index.push(sanitize_id(index_id));
    std::fs::create_dir_all(&per_index).context("create per-index data dir")?;
    Ok(per_index)
}
/// Crate-visible alias for the private `sanitize_id`, exposing the same
/// id-to-directory-name mapping that `index_data_dir` uses.
pub(crate) fn sanitize_id_for_path(id: &str) -> String {
sanitize_id(id)
}
/// Maps an index id to a string that is safe to use as a single directory name.
///
/// Every character other than an ASCII letter, ASCII digit, `.`, `_` or `-`
/// is replaced with `_`. The mapping is lossy: distinct ids can collapse to
/// the same name (for example `a/b` and `a_b`).
///
/// Results that would not name a real child directory are rewritten as well:
/// an empty id and `.` become `_`, and `..` becomes `__`. Without this,
/// joining the result onto the `indexes` directory would resolve to that
/// directory itself or to its parent, and a recursive removal of "the index
/// directory" would delete unrelated data.
fn sanitize_id(id: &str) -> String {
    let cleaned: String = id
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-') {
                c
            } else {
                '_'
            }
        })
        .collect();

    match cleaned.as_str() {
        // Empty / current-directory component: would alias `indexes/` itself.
        "" | "." => String::from("_"),
        // Parent-directory component: would escape `indexes/`.
        ".." => String::from("__"),
        _ => cleaned,
    }
}
/// Path of `hnsw.usearch` inside the per-index data directory.
///
/// Goes through [`index_data_dir`], so the directory is created if missing.
pub fn hnsw_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("hnsw.usearch"))
}
/// Path of `chunks.json` inside the per-index data directory.
///
/// Goes through [`index_data_dir`], so the directory is created if missing.
pub fn chunks_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("chunks.json"))
}
/// Path of `index.redb` inside the per-index data directory.
///
/// Goes through [`index_data_dir`], so the directory is created if missing.
pub fn corpus_redb_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("index.redb"))
}
/// Path of `schema_version.json` inside the per-index data directory.
///
/// Goes through [`index_data_dir`], so the directory is created if missing.
pub fn schema_version_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("schema_version.json"))
}
/// Path of `index.redb.tmp` inside the per-index data directory.
///
/// Goes through [`index_data_dir`], so the directory is created if missing.
pub fn corpus_redb_tmp_path(index_id: &str) -> Result<PathBuf> {
    index_data_dir(index_id).map(|dir| dir.join("index.redb.tmp"))
}
/// HNSW file path for a registry entry.
///
/// Non-colocated entries use the per-id data directory; colocated entries are
/// resolved by `colocated_storage` from the entry's `root_path`.
pub fn hnsw_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return hnsw_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_hnsw_path(&entry.root_path)
}
/// redb file path for a registry entry.
///
/// Non-colocated entries use the per-id data directory; colocated entries are
/// resolved by `colocated_storage` from the entry's `root_path`.
pub fn corpus_redb_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return corpus_redb_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_redb_path(&entry.root_path)
}
/// Schema-version file path for a registry entry.
///
/// Non-colocated entries use the per-id data directory; colocated entries are
/// resolved by `colocated_storage` from the entry's `root_path`.
pub fn schema_version_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return schema_version_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_schema_version_path(&entry.root_path)
}
/// Temporary redb file path for a registry entry.
///
/// Non-colocated entries use the per-id data directory; colocated entries are
/// resolved by `colocated_storage` from the entry's `root_path`.
pub fn corpus_redb_tmp_path_for_entry(entry: &PersistedIndex) -> Result<PathBuf> {
    if !entry.colocated {
        return corpus_redb_tmp_path(&entry.id);
    }
    crate::service::colocated_storage::colocated_redb_tmp_path(&entry.root_path)
}
/// Loads the registry from its default location (see [`indexes_toml_path`]).
///
/// # Errors
/// Fails when the registry path cannot be resolved, or as described on
/// [`load_index_registry_at`].
pub fn load_index_registry() -> Result<Vec<PersistedIndex>> {
    let registry_file = indexes_toml_path()?;
    load_index_registry_at(&registry_file)
}
/// Reads the registry stored at `path`.
///
/// * A missing file yields an empty registry.
/// * A file that cannot be parsed is logged at `warn` level and also yields
///   an empty registry rather than an error.
///
/// NOTE(review): because a corrupt file loads as empty, a following
/// load-modify-save cycle (e.g. an upsert) will overwrite its contents.
///
/// # Errors
/// Any read failure other than "not found" is returned with the context
/// `read indexes.toml`.
pub fn load_index_registry_at(path: &Path) -> Result<Vec<PersistedIndex>> {
    let raw = match std::fs::read_to_string(path) {
        Ok(text) => text,
        Err(io_err) => {
            if io_err.kind() == std::io::ErrorKind::NotFound {
                return Ok(Vec::new());
            }
            return Err(io_err).context("read indexes.toml");
        }
    };

    let indexes = toml::from_str::<IndexRegistryFile>(&raw)
        .map(|file| file.indexes)
        .unwrap_or_else(|e| {
            tracing::warn!(
                "indexes.toml at {} is corrupt ({e}); starting with empty registry",
                path.display()
            );
            Vec::new()
        });
    Ok(indexes)
}
/// Writes `entries` to the registry at its default location
/// (see [`indexes_toml_path`]).
///
/// # Errors
/// Fails when the registry path cannot be resolved, or as described on
/// [`save_index_registry_at`].
pub fn save_index_registry(entries: &[PersistedIndex]) -> Result<()> {
    let registry_file = indexes_toml_path()?;
    save_index_registry_at(&registry_file, entries)
}
/// Replaces the registry at `path` with `entries`.
///
/// The serialized TOML is first written to a sibling file whose extension is
/// `toml.tmp`, then renamed over `path`, so `path` is only replaced once the
/// new content has been written out in full.
///
/// # Errors
/// Fails when serialization, the temporary write, or the rename fails.
pub fn save_index_registry_at(path: &Path, entries: &[PersistedIndex]) -> Result<()> {
    let staging = path.with_extension("toml.tmp");
    let body = toml::to_string_pretty(&IndexRegistryFile {
        indexes: entries.to_vec(),
    })
    .context("serialize indexes.toml")?;

    std::fs::write(&staging, body).context("write indexes.toml tmp")?;
    std::fs::rename(&staging, path).context("rename indexes.toml")
}
/// Inserts or replaces `entry` in the registry at its default location
/// (see [`indexes_toml_path`]).
///
/// # Errors
/// Fails when the registry path cannot be resolved, or as described on
/// [`upsert_index_registry_entry_at`].
pub fn upsert_index_registry_entry(entry: PersistedIndex) -> Result<()> {
    let registry_file = indexes_toml_path()?;
    upsert_index_registry_entry_at(&registry_file, entry)
}
/// Inserts `entry` into the registry at `path`, or replaces the first existing
/// entry with the same `id`, then writes the registry back.
///
/// # Errors
/// Fails when the registry cannot be loaded or saved.
pub fn upsert_index_registry_entry_at(path: &Path, entry: PersistedIndex) -> Result<()> {
    let mut registry = load_index_registry_at(path)?;
    match registry.iter().position(|known| known.id == entry.id) {
        Some(slot) => registry[slot] = entry,
        None => registry.push(entry),
    }
    save_index_registry_at(path, &registry)
}
/// Removes the entry with the given `id` from the registry at its default
/// location (see [`indexes_toml_path`]).
///
/// # Errors
/// Fails when the registry path cannot be resolved, or as described on
/// [`remove_index_registry_entry_at`].
pub fn remove_index_registry_entry(id: &str) -> Result<()> {
    let registry_file = indexes_toml_path()?;
    remove_index_registry_entry_at(&registry_file, id)
}
/// Removes every entry whose id equals `id` from the registry at `path`.
///
/// When no entry matches, the file is left untouched (no rewrite happens).
///
/// # Errors
/// Fails when the registry cannot be loaded or saved.
pub fn remove_index_registry_entry_at(path: &Path, id: &str) -> Result<()> {
    let mut registry = load_index_registry_at(path)?;
    // Nothing to drop: skip the rewrite entirely.
    if !registry.iter().any(|known| known.id == id) {
        return Ok(());
    }
    registry.retain(|known| known.id != id);
    save_index_registry_at(path, &registry)
}
/// Recursively deletes `<data_dir>/indexes/<sanitized id>` when it exists.
///
/// The path is built directly rather than through [`index_data_dir`], so the
/// directory is never created just to be removed. A missing directory is not
/// an error.
///
/// # Errors
/// Fails when the data directory cannot be resolved or the removal fails; the
/// removal error carries the offending path as context.
pub fn remove_index_data_dir(index_id: &str) -> Result<()> {
    let target = data_dir()?.join("indexes").join(sanitize_id(index_id));
    if !target.exists() {
        return Ok(());
    }
    std::fs::remove_dir_all(&target).with_context(|| format!("remove {}", target.display()))
}
/// Reports whether `path` points at an existing regular file.
///
/// `Path::is_file` already returns `false` for paths that do not exist (or
/// cannot be inspected), so no separate existence check is needed; a single
/// metadata lookup also avoids the gap between two independent checks.
pub fn has_persisted_hnsw(path: &Path) -> bool {
    path.is_file()
}
#[cfg(test)]
#[path = "persistence_tests.rs"]
mod tests;